1 /*
2 * Resizable virtual memory filesystem for Linux.
3 *
4 * Copyright (C) 2000 Linus Torvalds.
5 * 2000 Transmeta Corp.
6 * 2000-2001 Christoph Rohland
7 * 2000-2001 SAP AG
8 * 2002 Red Hat Inc.
9 * Copyright (C) 2002-2011 Hugh Dickins.
10 * Copyright (C) 2011 Google Inc.
11 * Copyright (C) 2002-2005 VERITAS Software Corporation.
12 * Copyright (C) 2004 Andi Kleen, SuSE Labs
13 *
14 * Extended attribute support for tmpfs:
15 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
16 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
17 *
18 * tiny-shmem:
19 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
20 *
21 * This file is released under the GPL.
22 */
23
24 #include <linux/fs.h>
25 #include <linux/init.h>
26 #include <linux/vfs.h>
27 #include <linux/mount.h>
28 #include <linux/ramfs.h>
29 #include <linux/pagemap.h>
30 #include <linux/file.h>
31 #include <linux/fileattr.h>
32 #include <linux/mm.h>
33 #include <linux/random.h>
34 #include <linux/sched/signal.h>
35 #include <linux/export.h>
36 #include <linux/shmem_fs.h>
37 #include <linux/swap.h>
38 #include <linux/uio.h>
39 #include <linux/hugetlb.h>
40 #include <linux/fs_parser.h>
41 #include <linux/swapfile.h>
42 #include <linux/iversion.h>
43 #include <linux/unicode.h>
44 #include "swap.h"
45
46 static struct vfsmount *shm_mnt __ro_after_init;
47
48 #ifdef CONFIG_SHMEM
49 /*
50 * This virtual memory filesystem is heavily based on the ramfs. It
51 * extends ramfs by the ability to use swap and honor resource limits
52 * which makes it a completely usable filesystem.
53 */
54
55 #include <linux/xattr.h>
56 #include <linux/exportfs.h>
57 #include <linux/posix_acl.h>
58 #include <linux/posix_acl_xattr.h>
59 #include <linux/mman.h>
60 #include <linux/string.h>
61 #include <linux/slab.h>
62 #include <linux/backing-dev.h>
63 #include <linux/writeback.h>
64 #include <linux/pagevec.h>
65 #include <linux/percpu_counter.h>
66 #include <linux/falloc.h>
67 #include <linux/splice.h>
68 #include <linux/security.h>
69 #include <linux/swapops.h>
70 #include <linux/mempolicy.h>
71 #include <linux/namei.h>
72 #include <linux/ctype.h>
73 #include <linux/migrate.h>
74 #include <linux/highmem.h>
75 #include <linux/seq_file.h>
76 #include <linux/magic.h>
77 #include <linux/syscalls.h>
78 #include <linux/fcntl.h>
79 #include <uapi/linux/memfd.h>
80 #include <linux/rmap.h>
81 #include <linux/uuid.h>
82 #include <linux/quotaops.h>
83 #include <linux/rcupdate_wait.h>
84
85 #include <linux/uaccess.h>
86
87 #include "internal.h"
88
89 #define VM_ACCT(size) (PAGE_ALIGN(size) >> PAGE_SHIFT)
90
91 /* Pretend that each entry is of this size in directory's i_size */
92 #define BOGO_DIRENT_SIZE 20
93
94 /* Pretend that one inode + its dentry occupy this much memory */
95 #define BOGO_INODE_SIZE 1024
96
97 /* Symlink up to this size is kmalloc'ed instead of using a swappable page */
98 #define SHORT_SYMLINK_LEN 128
99
100 /*
101 * shmem_fallocate communicates with shmem_fault or shmem_writeout via
102 * inode->i_private (with i_rwsem making sure that it has only one user at
103 * a time): we would prefer not to enlarge the shmem inode just for that.
104 */
105 struct shmem_falloc {
106 wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
107 pgoff_t start; /* start of range currently being fallocated */
108 pgoff_t next; /* the next page offset to be fallocated */
109 pgoff_t nr_falloced; /* how many new pages have been fallocated */
110 pgoff_t nr_unswapped; /* how often writeout refused to swap out */
111 };
112
113 struct shmem_options {
114 unsigned long long blocks;
115 unsigned long long inodes;
116 struct mempolicy *mpol;
117 kuid_t uid;
118 kgid_t gid;
119 umode_t mode;
120 bool full_inums;
121 int huge;
122 int seen;
123 bool noswap;
124 unsigned short quota_types;
125 struct shmem_quota_limits qlimits;
126 #if IS_ENABLED(CONFIG_UNICODE)
127 struct unicode_map *encoding;
128 bool strict_encoding;
129 #endif
130 #define SHMEM_SEEN_BLOCKS 1
131 #define SHMEM_SEEN_INODES 2
132 #define SHMEM_SEEN_HUGE 4
133 #define SHMEM_SEEN_INUMS 8
134 #define SHMEM_SEEN_NOSWAP 16
135 #define SHMEM_SEEN_QUOTA 32
136 };
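
/*
 * Illustrative only (example command is not from this file): a mount such as
 * "mount -t tmpfs -o size=512m,nr_inodes=10k,mode=1777 tmpfs /mnt" is parsed
 * into the fields above (blocks derived from size, inodes, mode), with the
 * SHMEM_SEEN_* bits in ->seen recording which options were actually given.
 */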
137
138 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
139 static unsigned long huge_shmem_orders_always __read_mostly;
140 static unsigned long huge_shmem_orders_madvise __read_mostly;
141 static unsigned long huge_shmem_orders_inherit __read_mostly;
142 static unsigned long huge_shmem_orders_within_size __read_mostly;
143 static bool shmem_orders_configured __initdata;
144 #endif
145
146 #ifdef CONFIG_TMPFS
147 static unsigned long shmem_default_max_blocks(void)
148 {
149 return totalram_pages() / 2;
150 }
151
152 static unsigned long shmem_default_max_inodes(void)
153 {
154 unsigned long nr_pages = totalram_pages();
155
156 return min3(nr_pages - totalhigh_pages(), nr_pages / 2,
157 ULONG_MAX / BOGO_INODE_SIZE);
158 }
159 #endif
160
161 static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
162 struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
163 struct vm_area_struct *vma, vm_fault_t *fault_type);
164
165 static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
166 {
167 return sb->s_fs_info;
168 }
169
170 /*
171 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
172 * for shared memory and for shared anonymous (/dev/zero) mappings
173 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
174 * consistent with the pre-accounting of private mappings ...
175 */
176 static inline int shmem_acct_size(unsigned long flags, loff_t size)
177 {
178 return (flags & VM_NORESERVE) ?
179 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
180 }
181
182 static inline void shmem_unacct_size(unsigned long flags, loff_t size)
183 {
184 if (!(flags & VM_NORESERVE))
185 vm_unacct_memory(VM_ACCT(size));
186 }
187
188 static inline int shmem_reacct_size(unsigned long flags,
189 loff_t oldsize, loff_t newsize)
190 {
191 if (!(flags & VM_NORESERVE)) {
192 if (VM_ACCT(newsize) > VM_ACCT(oldsize))
193 return security_vm_enough_memory_mm(current->mm,
194 VM_ACCT(newsize) - VM_ACCT(oldsize));
195 else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
196 vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
197 }
198 return 0;
199 }
200
201 /*
202 * ... whereas tmpfs objects are accounted incrementally as
203 * pages are allocated, in order to allow large sparse files.
204 * shmem_get_folio reports shmem_acct_blocks failure as -ENOSPC not -ENOMEM,
205 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
206 */
207 static inline int shmem_acct_blocks(unsigned long flags, long pages)
208 {
209 if (!(flags & VM_NORESERVE))
210 return 0;
211
212 return security_vm_enough_memory_mm(current->mm,
213 pages * VM_ACCT(PAGE_SIZE));
214 }
215
216 static inline void shmem_unacct_blocks(unsigned long flags, long pages)
217 {
218 if (flags & VM_NORESERVE)
219 vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
220 }
221
222 static int shmem_inode_acct_blocks(struct inode *inode, long pages)
223 {
224 struct shmem_inode_info *info = SHMEM_I(inode);
225 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
226 int err = -ENOSPC;
227
228 if (shmem_acct_blocks(info->flags, pages))
229 return err;
230
231 might_sleep(); /* when quotas */
232 if (sbinfo->max_blocks) {
233 if (!percpu_counter_limited_add(&sbinfo->used_blocks,
234 sbinfo->max_blocks, pages))
235 goto unacct;
236
237 err = dquot_alloc_block_nodirty(inode, pages);
238 if (err) {
239 percpu_counter_sub(&sbinfo->used_blocks, pages);
240 goto unacct;
241 }
242 } else {
243 err = dquot_alloc_block_nodirty(inode, pages);
244 if (err)
245 goto unacct;
246 }
247
248 return 0;
249
250 unacct:
251 shmem_unacct_blocks(info->flags, pages);
252 return err;
253 }
254
255 static void shmem_inode_unacct_blocks(struct inode *inode, long pages)
256 {
257 struct shmem_inode_info *info = SHMEM_I(inode);
258 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
259
260 might_sleep(); /* when quotas */
261 dquot_free_block_nodirty(inode, pages);
262
263 if (sbinfo->max_blocks)
264 percpu_counter_sub(&sbinfo->used_blocks, pages);
265 shmem_unacct_blocks(info->flags, pages);
266 }
267
268 static const struct super_operations shmem_ops;
269 static const struct address_space_operations shmem_aops;
270 static const struct file_operations shmem_file_operations;
271 static const struct inode_operations shmem_inode_operations;
272 static const struct inode_operations shmem_dir_inode_operations;
273 static const struct inode_operations shmem_special_inode_operations;
274 static const struct vm_operations_struct shmem_vm_ops;
275 static const struct vm_operations_struct shmem_anon_vm_ops;
276 static struct file_system_type shmem_fs_type;
277
278 bool shmem_mapping(struct address_space *mapping)
279 {
280 return mapping->a_ops == &shmem_aops;
281 }
282 EXPORT_SYMBOL_GPL(shmem_mapping);
283
284 bool vma_is_anon_shmem(struct vm_area_struct *vma)
285 {
286 return vma->vm_ops == &shmem_anon_vm_ops;
287 }
288
289 bool vma_is_shmem(struct vm_area_struct *vma)
290 {
291 return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops;
292 }
293
294 static LIST_HEAD(shmem_swaplist);
295 static DEFINE_SPINLOCK(shmem_swaplist_lock);
296
297 #ifdef CONFIG_TMPFS_QUOTA
298
299 static int shmem_enable_quotas(struct super_block *sb,
300 unsigned short quota_types)
301 {
302 int type, err = 0;
303
304 sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY;
305 for (type = 0; type < SHMEM_MAXQUOTAS; type++) {
306 if (!(quota_types & (1 << type)))
307 continue;
308 err = dquot_load_quota_sb(sb, type, QFMT_SHMEM,
309 DQUOT_USAGE_ENABLED |
310 DQUOT_LIMITS_ENABLED);
311 if (err)
312 goto out_err;
313 }
314 return 0;
315
316 out_err:
317 pr_warn("tmpfs: failed to enable quota tracking (type=%d, err=%d)\n",
318 type, err);
319 for (type--; type >= 0; type--)
320 dquot_quota_off(sb, type);
321 return err;
322 }
323
324 static void shmem_disable_quotas(struct super_block *sb)
325 {
326 int type;
327
328 for (type = 0; type < SHMEM_MAXQUOTAS; type++)
329 dquot_quota_off(sb, type);
330 }
331
332 static struct dquot __rcu **shmem_get_dquots(struct inode *inode)
333 {
334 return SHMEM_I(inode)->i_dquot;
335 }
336 #endif /* CONFIG_TMPFS_QUOTA */
337
338 /*
339 * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
340 * produces a novel ino for the newly allocated inode.
341 *
342 * It may also be called when making a hard link to permit the space needed by
343 * each dentry. However, in that case, no new inode number is needed since that
344 * internally draws from another pool of inode numbers (currently global
345 * get_next_ino()). This case is indicated by passing NULL as inop.
346 */
347 #define SHMEM_INO_BATCH 1024
348 static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
349 {
350 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
351 ino_t ino;
352
353 if (!(sb->s_flags & SB_KERNMOUNT)) {
354 raw_spin_lock(&sbinfo->stat_lock);
355 if (sbinfo->max_inodes) {
356 if (sbinfo->free_ispace < BOGO_INODE_SIZE) {
357 raw_spin_unlock(&sbinfo->stat_lock);
358 return -ENOSPC;
359 }
360 sbinfo->free_ispace -= BOGO_INODE_SIZE;
361 }
362 if (inop) {
363 ino = sbinfo->next_ino++;
364 if (unlikely(is_zero_ino(ino)))
365 ino = sbinfo->next_ino++;
366 if (unlikely(!sbinfo->full_inums &&
367 ino > UINT_MAX)) {
368 /*
369 * Emulate get_next_ino uint wraparound for
370 * compatibility
371 */
372 if (IS_ENABLED(CONFIG_64BIT))
373 pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
374 __func__, MINOR(sb->s_dev));
375 sbinfo->next_ino = 1;
376 ino = sbinfo->next_ino++;
377 }
378 *inop = ino;
379 }
380 raw_spin_unlock(&sbinfo->stat_lock);
381 } else if (inop) {
382 /*
383 * __shmem_file_setup, one of our callers, is lock-free: it
384 * doesn't hold stat_lock in shmem_reserve_inode since
385 * max_inodes is always 0, and is called from potentially
386 * unknown contexts. As such, use a per-cpu batched allocator
387 * which doesn't require the per-sb stat_lock unless we are at
388 * the batch boundary.
389 *
390 * We don't need to worry about inode{32,64} since SB_KERNMOUNT
391 * shmem mounts are not exposed to userspace, so we don't need
392 * to worry about things like glibc compatibility.
393 */
394 ino_t *next_ino;
395
396 next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
397 ino = *next_ino;
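/*
 * The per-CPU counter starts at 0 and is advanced past each allocation,
 * so a value that is a multiple of SHMEM_INO_BATCH (including that
 * initial 0) means this CPU's batch is exhausted and must be refilled
 * from the shared sbinfo->next_ino under stat_lock.
 */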
398 if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
399 raw_spin_lock(&sbinfo->stat_lock);
400 ino = sbinfo->next_ino;
401 sbinfo->next_ino += SHMEM_INO_BATCH;
402 raw_spin_unlock(&sbinfo->stat_lock);
403 if (unlikely(is_zero_ino(ino)))
404 ino++;
405 }
406 *inop = ino;
407 *next_ino = ++ino;
408 put_cpu();
409 }
410
411 return 0;
412 }
413
414 static void shmem_free_inode(struct super_block *sb, size_t freed_ispace)
415 {
416 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
417 if (sbinfo->max_inodes) {
418 raw_spin_lock(&sbinfo->stat_lock);
419 sbinfo->free_ispace += BOGO_INODE_SIZE + freed_ispace;
420 raw_spin_unlock(&sbinfo->stat_lock);
421 }
422 }
423
424 /**
425 * shmem_recalc_inode - recalculate the block usage of an inode
426 * @inode: inode to recalc
427 * @alloced: the change in number of pages allocated to inode
428 * @swapped: the change in number of pages swapped from inode
429 *
430 * We have to calculate the free blocks since the mm can drop
431 * undirtied hole pages behind our back.
432 *
433 * But normally info->alloced == inode->i_mapping->nrpages + info->swapped
434 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
435 *
436 * Return: true if swapped was incremented from 0, for shmem_writeout().
437 */
438 static bool shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
439 {
440 struct shmem_inode_info *info = SHMEM_I(inode);
441 bool first_swapped = false;
442 long freed;
443
444 spin_lock(&info->lock);
445 info->alloced += alloced;
446 info->swapped += swapped;
447 freed = info->alloced - info->swapped -
448 READ_ONCE(inode->i_mapping->nrpages);
449 /*
450 * Special case: whereas normally shmem_recalc_inode() is called
451 * after i_mapping->nrpages has already been adjusted (up or down),
452 * shmem_writeout() has to raise swapped before nrpages is lowered -
453 * to stop a racing shmem_recalc_inode() from thinking that a page has
454 * been freed. Compensate here, to avoid the need for a followup call.
455 */
456 if (swapped > 0) {
457 if (info->swapped == swapped)
458 first_swapped = true;
459 freed += swapped;
460 }
461 if (freed > 0)
462 info->alloced -= freed;
463 spin_unlock(&info->lock);
464
465 /* The quota case may block */
466 if (freed > 0)
467 shmem_inode_unacct_blocks(inode, freed);
468 return first_swapped;
469 }
470
471 bool shmem_charge(struct inode *inode, long pages)
472 {
473 struct address_space *mapping = inode->i_mapping;
474
475 if (shmem_inode_acct_blocks(inode, pages))
476 return false;
477
478 /* nrpages adjustment first, then shmem_recalc_inode() when balanced */
479 xa_lock_irq(&mapping->i_pages);
480 mapping->nrpages += pages;
481 xa_unlock_irq(&mapping->i_pages);
482
483 shmem_recalc_inode(inode, pages, 0);
484 return true;
485 }
486
487 void shmem_uncharge(struct inode *inode, long pages)
488 {
489 /* pages argument is currently unused: keep it to help debugging */
490 /* nrpages adjustment done by __filemap_remove_folio() or caller */
491
492 shmem_recalc_inode(inode, 0, 0);
493 }
494
495 /*
496 * Replace item expected in xarray by a new item, while holding xa_lock.
497 */
498 static int shmem_replace_entry(struct address_space *mapping,
499 pgoff_t index, void *expected, void *replacement)
500 {
501 XA_STATE(xas, &mapping->i_pages, index);
502 void *item;
503
504 VM_BUG_ON(!expected);
505 VM_BUG_ON(!replacement);
506 item = xas_load(&xas);
507 if (item != expected)
508 return -ENOENT;
509 xas_store(&xas, replacement);
510 return 0;
511 }
512
513 /*
514 * Sometimes, before we decide whether to proceed or to fail, we must check
515 * that an entry was not already brought back from swap by a racing thread.
516 *
517 * Checking folio is not enough: by the time a swapcache folio is locked, it
518 * might be reused, and again be swapcache, using the same swap as before.
519 */
520 static bool shmem_confirm_swap(struct address_space *mapping,
521 pgoff_t index, swp_entry_t swap)
522 {
523 return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
524 }
525
526 /*
527 * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
528 *
529 * SHMEM_HUGE_NEVER:
530 * disables huge pages for the mount;
531 * SHMEM_HUGE_ALWAYS:
532 * enables huge pages for the mount;
533 * SHMEM_HUGE_WITHIN_SIZE:
534 * only allocate huge pages if the page will be fully within i_size,
535 * also respect madvise() hints;
536 * SHMEM_HUGE_ADVISE:
537 * only allocate huge pages if requested with madvise();
538 */
539
540 #define SHMEM_HUGE_NEVER 0
541 #define SHMEM_HUGE_ALWAYS 1
542 #define SHMEM_HUGE_WITHIN_SIZE 2
543 #define SHMEM_HUGE_ADVISE 3
544
545 /*
546 * Special values.
547 * Can only be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
548 *
549 * SHMEM_HUGE_DENY:
550 * disables huge on shm_mnt and all mounts, for emergency use;
551 * SHMEM_HUGE_FORCE:
552 * enables huge on shm_mnt and all mounts, w/o needing option, for testing;
553 *
554 */
555 #define SHMEM_HUGE_DENY (-1)
556 #define SHMEM_HUGE_FORCE (-2)
557
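/*
 * For reference (documented in Documentation/admin-guide/mm/transhuge.rst,
 * not in this file): a per-mount policy is selected with e.g.
 * "mount -t tmpfs -o huge=within_size tmpfs /mnt", while the special values
 * above are set with e.g.
 * "echo force > /sys/kernel/mm/transparent_hugepage/shmem_enabled".
 */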
558 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
559 /* ifdef here to avoid bloating shmem.o when not necessary */
560
561 static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
562 static int tmpfs_huge __read_mostly = SHMEM_HUGE_NEVER;
563
564 /**
565 * shmem_mapping_size_orders - Get allowable folio orders for the given file size.
566 * @mapping: Target address_space.
567 * @index: The page index.
568 * @write_end: end of a write, could extend inode size.
569 *
570 * This returns huge orders for folios (when supported) based on the file size
571 * which the mapping currently allows at the given index. The index is relevant
572 * due to alignment considerations the mapping might have. The returned order
573 * may be less than the size passed.
574 *
575 * Return: The orders.
576 */
577 static inline unsigned int
578 shmem_mapping_size_orders(struct address_space *mapping, pgoff_t index, loff_t write_end)
579 {
580 unsigned int order;
581 size_t size;
582
583 if (!mapping_large_folio_support(mapping) || !write_end)
584 return 0;
585
586 /* Calculate the write size based on the write_end */
587 size = write_end - (index << PAGE_SHIFT);
588 order = filemap_get_order(size);
589 if (!order)
590 return 0;
591
592 /* If we're not aligned, allocate a smaller folio */
593 if (index & ((1UL << order) - 1))
594 order = __ffs(index);
595
596 order = min_t(size_t, order, MAX_PAGECACHE_ORDER);
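/* BIT(order + 1) - 1 is a mask of every order from 0 up to and including "order". */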
597 return order > 0 ? BIT(order + 1) - 1 : 0;
598 }
599
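/*
 * Reduce within_size_orders to those orders whose naturally aligned folio at
 * this index would still end within i_size (or within write_end, if larger);
 * returns 0 if no candidate order fits.
 */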
600 static unsigned int shmem_get_orders_within_size(struct inode *inode,
601 unsigned long within_size_orders, pgoff_t index,
602 loff_t write_end)
603 {
604 pgoff_t aligned_index;
605 unsigned long order;
606 loff_t i_size;
607
608 order = highest_order(within_size_orders);
609 while (within_size_orders) {
610 aligned_index = round_up(index + 1, 1 << order);
611 i_size = max(write_end, i_size_read(inode));
612 i_size = round_up(i_size, PAGE_SIZE);
613 if (i_size >> PAGE_SHIFT >= aligned_index)
614 return within_size_orders;
615
616 order = next_order(&within_size_orders, order);
617 }
618
619 return 0;
620 }
621
622 static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
623 loff_t write_end, bool shmem_huge_force,
624 struct vm_area_struct *vma,
625 vm_flags_t vm_flags)
626 {
627 unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ?
628 0 : BIT(HPAGE_PMD_ORDER);
629 unsigned long within_size_orders;
630
631 if (!S_ISREG(inode->i_mode))
632 return 0;
633 if (shmem_huge == SHMEM_HUGE_DENY)
634 return 0;
635 if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE)
636 return maybe_pmd_order;
637
638 /*
639 * The huge order allocation for anon shmem is controlled through
640 * the mTHP interface, so we still use PMD-sized huge order to
641 * check whether global control is enabled.
642 *
643 * For tmpfs mmap()'s huge order, we still use PMD-sized order to
644 * allocate huge pages due to lack of a write size hint.
645 *
646 * Otherwise, tmpfs will allow getting a highest order hint based on
647 * the size of write and fallocate paths, then will try each allowable
648 * huge orders.
649 */
650 switch (SHMEM_SB(inode->i_sb)->huge) {
651 case SHMEM_HUGE_ALWAYS:
652 if (vma)
653 return maybe_pmd_order;
654
655 return shmem_mapping_size_orders(inode->i_mapping, index, write_end);
656 case SHMEM_HUGE_WITHIN_SIZE:
657 if (vma)
658 within_size_orders = maybe_pmd_order;
659 else
660 within_size_orders = shmem_mapping_size_orders(inode->i_mapping,
661 index, write_end);
662
663 within_size_orders = shmem_get_orders_within_size(inode, within_size_orders,
664 index, write_end);
665 if (within_size_orders > 0)
666 return within_size_orders;
667
668 fallthrough;
669 case SHMEM_HUGE_ADVISE:
670 if (vm_flags & VM_HUGEPAGE)
671 return maybe_pmd_order;
672 fallthrough;
673 default:
674 return 0;
675 }
676 }
677
678 static int shmem_parse_huge(const char *str)
679 {
680 int huge;
681
682 if (!str)
683 return -EINVAL;
684
685 if (!strcmp(str, "never"))
686 huge = SHMEM_HUGE_NEVER;
687 else if (!strcmp(str, "always"))
688 huge = SHMEM_HUGE_ALWAYS;
689 else if (!strcmp(str, "within_size"))
690 huge = SHMEM_HUGE_WITHIN_SIZE;
691 else if (!strcmp(str, "advise"))
692 huge = SHMEM_HUGE_ADVISE;
693 else if (!strcmp(str, "deny"))
694 huge = SHMEM_HUGE_DENY;
695 else if (!strcmp(str, "force"))
696 huge = SHMEM_HUGE_FORCE;
697 else
698 return -EINVAL;
699
700 if (!has_transparent_hugepage() &&
701 huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
702 return -EINVAL;
703
704 /* Do not override huge allocation policy with non-PMD sized mTHP */
705 if (huge == SHMEM_HUGE_FORCE &&
706 huge_shmem_orders_inherit != BIT(HPAGE_PMD_ORDER))
707 return -EINVAL;
708
709 return huge;
710 }
711
712 #if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
713 static const char *shmem_format_huge(int huge)
714 {
715 switch (huge) {
716 case SHMEM_HUGE_NEVER:
717 return "never";
718 case SHMEM_HUGE_ALWAYS:
719 return "always";
720 case SHMEM_HUGE_WITHIN_SIZE:
721 return "within_size";
722 case SHMEM_HUGE_ADVISE:
723 return "advise";
724 case SHMEM_HUGE_DENY:
725 return "deny";
726 case SHMEM_HUGE_FORCE:
727 return "force";
728 default:
729 VM_BUG_ON(1);
730 return "bad_val";
731 }
732 }
733 #endif
734
735 static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
736 struct shrink_control *sc, unsigned long nr_to_free)
737 {
738 LIST_HEAD(list), *pos, *next;
739 struct inode *inode;
740 struct shmem_inode_info *info;
741 struct folio *folio;
742 unsigned long batch = sc ? sc->nr_to_scan : 128;
743 unsigned long split = 0, freed = 0;
744
745 if (list_empty(&sbinfo->shrinklist))
746 return SHRINK_STOP;
747
748 spin_lock(&sbinfo->shrinklist_lock);
749 list_for_each_safe(pos, next, &sbinfo->shrinklist) {
750 info = list_entry(pos, struct shmem_inode_info, shrinklist);
751
752 /* pin the inode */
753 inode = igrab(&info->vfs_inode);
754
755 /* inode is about to be evicted */
756 if (!inode) {
757 list_del_init(&info->shrinklist);
758 goto next;
759 }
760
761 list_move(&info->shrinklist, &list);
762 next:
763 sbinfo->shrinklist_len--;
764 if (!--batch)
765 break;
766 }
767 spin_unlock(&sbinfo->shrinklist_lock);
768
769 list_for_each_safe(pos, next, &list) {
770 pgoff_t next, end;
771 loff_t i_size;
772 int ret;
773
774 info = list_entry(pos, struct shmem_inode_info, shrinklist);
775 inode = &info->vfs_inode;
776
777 if (nr_to_free && freed >= nr_to_free)
778 goto move_back;
779
780 i_size = i_size_read(inode);
781 folio = filemap_get_entry(inode->i_mapping, i_size / PAGE_SIZE);
782 if (!folio || xa_is_value(folio))
783 goto drop;
784
785 /* No large folio at the end of the file: nothing to split */
786 if (!folio_test_large(folio)) {
787 folio_put(folio);
788 goto drop;
789 }
790
791 /* Check if there is anything to gain from splitting */
792 next = folio_next_index(folio);
793 end = shmem_fallocend(inode, DIV_ROUND_UP(i_size, PAGE_SIZE));
794 if (end <= folio->index || end >= next) {
795 folio_put(folio);
796 goto drop;
797 }
798
799 /*
800 * Move the inode on the list back to shrinklist if we failed
801 * to lock the page at this time.
802 *
803 * Waiting for the lock may lead to deadlock in the
804 * reclaim path.
805 */
806 if (!folio_trylock(folio)) {
807 folio_put(folio);
808 goto move_back;
809 }
810
811 ret = split_folio(folio);
812 folio_unlock(folio);
813 folio_put(folio);
814
815 /* If split failed move the inode on the list back to shrinklist */
816 if (ret)
817 goto move_back;
818
819 freed += next - end;
820 split++;
821 drop:
822 list_del_init(&info->shrinklist);
823 goto put;
824 move_back:
825 /*
826 * Make sure the inode is either on the global list or deleted
827 * from any local list before iput() since it could be deleted
828 * in another thread once we put the inode (then the local list
829 * is corrupted).
830 */
831 spin_lock(&sbinfo->shrinklist_lock);
832 list_move(&info->shrinklist, &sbinfo->shrinklist);
833 sbinfo->shrinklist_len++;
834 spin_unlock(&sbinfo->shrinklist_lock);
835 put:
836 iput(inode);
837 }
838
839 return split;
840 }
841
842 static long shmem_unused_huge_scan(struct super_block *sb,
843 struct shrink_control *sc)
844 {
845 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
846
847 if (!READ_ONCE(sbinfo->shrinklist_len))
848 return SHRINK_STOP;
849
850 return shmem_unused_huge_shrink(sbinfo, sc, 0);
851 }
852
853 static long shmem_unused_huge_count(struct super_block *sb,
854 struct shrink_control *sc)
855 {
856 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
857 return READ_ONCE(sbinfo->shrinklist_len);
858 }
859 #else /* !CONFIG_TRANSPARENT_HUGEPAGE */
860
861 #define shmem_huge SHMEM_HUGE_DENY
862
863 static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
864 struct shrink_control *sc, unsigned long nr_to_free)
865 {
866 return 0;
867 }
868
869 static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
870 loff_t write_end, bool shmem_huge_force,
871 struct vm_area_struct *vma,
872 vm_flags_t vm_flags)
873 {
874 return 0;
875 }
876 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
877
878 static void shmem_update_stats(struct folio *folio, int nr_pages)
879 {
880 if (folio_test_pmd_mappable(folio))
881 __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages);
882 __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages);
883 __lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages);
884 }
885
886 /*
887 * Somewhat like filemap_add_folio, but error if expected item has gone.
888 */
889 static int shmem_add_to_page_cache(struct folio *folio,
890 struct address_space *mapping,
891 pgoff_t index, void *expected, gfp_t gfp)
892 {
893 XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
894 long nr = folio_nr_pages(folio);
895
896 VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
897 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
898 VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
899
900 folio_ref_add(folio, nr);
901 folio->mapping = mapping;
902 folio->index = index;
903
904 gfp &= GFP_RECLAIM_MASK;
905 folio_throttle_swaprate(folio, gfp);
906
907 do {
908 xas_lock_irq(&xas);
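/*
 * The target range must hold nothing but the expected entry (e.g. a
 * swap entry being replaced), or nothing at all: any other conflicting
 * entry means someone else got there first.
 */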
909 if (expected != xas_find_conflict(&xas)) {
910 xas_set_err(&xas, -EEXIST);
911 goto unlock;
912 }
913 if (expected && xas_find_conflict(&xas)) {
914 xas_set_err(&xas, -EEXIST);
915 goto unlock;
916 }
917 xas_store(&xas, folio);
918 if (xas_error(&xas))
919 goto unlock;
920 shmem_update_stats(folio, nr);
921 mapping->nrpages += nr;
922 unlock:
923 xas_unlock_irq(&xas);
924 } while (xas_nomem(&xas, gfp));
925
926 if (xas_error(&xas)) {
927 folio->mapping = NULL;
928 folio_ref_sub(folio, nr);
929 return xas_error(&xas);
930 }
931
932 return 0;
933 }
934
935 /*
936 * Somewhat like filemap_remove_folio, but substitutes swap for @folio.
937 */
938 static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
939 {
940 struct address_space *mapping = folio->mapping;
941 long nr = folio_nr_pages(folio);
942 int error;
943
944 xa_lock_irq(&mapping->i_pages);
945 error = shmem_replace_entry(mapping, folio->index, folio, radswap);
946 folio->mapping = NULL;
947 mapping->nrpages -= nr;
948 shmem_update_stats(folio, -nr);
949 xa_unlock_irq(&mapping->i_pages);
950 folio_put_refs(folio, nr);
951 BUG_ON(error);
952 }
953
954 /*
955 * Remove swap entry from page cache, free the swap and its page cache. Returns
956 * the number of pages being freed. 0 means entry not found in XArray (0 pages
957 * being freed).
958 */
959 static long shmem_free_swap(struct address_space *mapping,
960 pgoff_t index, void *radswap)
961 {
962 int order = xa_get_order(&mapping->i_pages, index);
963 void *old;
964
965 old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
966 if (old != radswap)
967 return 0;
968 free_swap_and_cache_nr(radix_to_swp_entry(radswap), 1 << order);
969
970 return 1 << order;
971 }
972
973 /*
974 * Determine (in bytes) how many of the shmem object's pages mapped by the
975 * given offsets are swapped out.
976 *
977 * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
978 * as long as the inode doesn't go away and racy results are not a problem.
979 */
980 unsigned long shmem_partial_swap_usage(struct address_space *mapping,
981 pgoff_t start, pgoff_t end)
982 {
983 XA_STATE(xas, &mapping->i_pages, start);
984 struct page *page;
985 unsigned long swapped = 0;
986 unsigned long max = end - 1;
987
988 rcu_read_lock();
989 xas_for_each(&xas, page, max) {
990 if (xas_retry(&xas, page))
991 continue;
992 if (xa_is_value(page))
993 swapped += 1 << xas_get_order(&xas);
994 if (xas.xa_index == max)
995 break;
996 if (need_resched()) {
997 xas_pause(&xas);
998 cond_resched_rcu();
999 }
1000 }
1001 rcu_read_unlock();
1002
1003 return swapped << PAGE_SHIFT;
1004 }
1005
1006 /*
1007 * Determine (in bytes) how many of the shmem object's pages mapped by the
1008 * given vma are swapped out.
1009 *
1010 * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
1011 * as long as the inode doesn't go away and racy results are not a problem.
1012 */
1013 unsigned long shmem_swap_usage(struct vm_area_struct *vma)
1014 {
1015 struct inode *inode = file_inode(vma->vm_file);
1016 struct shmem_inode_info *info = SHMEM_I(inode);
1017 struct address_space *mapping = inode->i_mapping;
1018 unsigned long swapped;
1019
1020 /* Be careful as we don't hold info->lock */
1021 swapped = READ_ONCE(info->swapped);
1022
1023 /*
1024 * The easier cases are when the shmem object has nothing in swap, or
1025 * the vma maps it whole. Then we can simply use the stats that we
1026 * already track.
1027 */
1028 if (!swapped)
1029 return 0;
1030
1031 if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
1032 return swapped << PAGE_SHIFT;
1033
1034 /* Here comes the more involved part */
1035 return shmem_partial_swap_usage(mapping, vma->vm_pgoff,
1036 vma->vm_pgoff + vma_pages(vma));
1037 }
1038
1039 /*
1040 * SysV IPC SHM_UNLOCK restores Unevictable pages to their evictable lists.
1041 */
1042 void shmem_unlock_mapping(struct address_space *mapping)
1043 {
1044 struct folio_batch fbatch;
1045 pgoff_t index = 0;
1046
1047 folio_batch_init(&fbatch);
1048 /*
1049 * Minor point, but we might as well stop if someone else SHM_LOCKs it.
1050 */
1051 while (!mapping_unevictable(mapping) &&
1052 filemap_get_folios(mapping, &index, ~0UL, &fbatch)) {
1053 check_move_unevictable_folios(&fbatch);
1054 folio_batch_release(&fbatch);
1055 cond_resched();
1056 }
1057 }
1058
1059 static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
1060 {
1061 struct folio *folio;
1062
1063 /*
1064 * At first avoid shmem_get_folio(,,,SGP_READ): that fails
1065 * beyond i_size, and reports fallocated folios as holes.
1066 */
1067 folio = filemap_get_entry(inode->i_mapping, index);
1068 if (!folio)
1069 return folio;
1070 if (!xa_is_value(folio)) {
1071 folio_lock(folio);
1072 if (folio->mapping == inode->i_mapping)
1073 return folio;
1074 /* The folio has been swapped out */
1075 folio_unlock(folio);
1076 folio_put(folio);
1077 }
1078 /*
1079 * But read a folio back from swap if any of it is within i_size
1080 * (although in some cases this is just a waste of time).
1081 */
1082 folio = NULL;
1083 shmem_get_folio(inode, index, 0, &folio, SGP_READ);
1084 return folio;
1085 }
1086
1087 /*
1088 * Remove range of pages and swap entries from page cache, and free them.
1089 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
1090 */
1091 static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
1092 bool unfalloc)
1093 {
1094 struct address_space *mapping = inode->i_mapping;
1095 struct shmem_inode_info *info = SHMEM_I(inode);
1096 pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
1097 pgoff_t end = (lend + 1) >> PAGE_SHIFT;
1098 struct folio_batch fbatch;
1099 pgoff_t indices[PAGEVEC_SIZE];
1100 struct folio *folio;
1101 bool same_folio;
1102 long nr_swaps_freed = 0;
1103 pgoff_t index;
1104 int i;
1105
1106 if (lend == -1)
1107 end = -1; /* unsigned, so actually very big */
1108
1109 if (info->fallocend > start && info->fallocend <= end && !unfalloc)
1110 info->fallocend = start;
1111
1112 folio_batch_init(&fbatch);
1113 index = start;
1114 while (index < end && find_lock_entries(mapping, &index, end - 1,
1115 &fbatch, indices)) {
1116 for (i = 0; i < folio_batch_count(&fbatch); i++) {
1117 folio = fbatch.folios[i];
1118
1119 if (xa_is_value(folio)) {
1120 if (unfalloc)
1121 continue;
1122 nr_swaps_freed += shmem_free_swap(mapping,
1123 indices[i], folio);
1124 continue;
1125 }
1126
1127 if (!unfalloc || !folio_test_uptodate(folio))
1128 truncate_inode_folio(mapping, folio);
1129 folio_unlock(folio);
1130 }
1131 folio_batch_remove_exceptionals(&fbatch);
1132 folio_batch_release(&fbatch);
1133 cond_resched();
1134 }
1135
1136 /*
1137 * When undoing a failed fallocate, we want none of the partial folio
1138 * zeroing and splitting below, but shall want to truncate the whole
1139 * folio when !uptodate indicates that it was added by this fallocate,
1140 * even when [lstart, lend] covers only a part of the folio.
1141 */
1142 if (unfalloc)
1143 goto whole_folios;
1144
1145 same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
1146 folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT);
1147 if (folio) {
1148 same_folio = lend < folio_pos(folio) + folio_size(folio);
1149 folio_mark_dirty(folio);
1150 if (!truncate_inode_partial_folio(folio, lstart, lend)) {
1151 start = folio_next_index(folio);
1152 if (same_folio)
1153 end = folio->index;
1154 }
1155 folio_unlock(folio);
1156 folio_put(folio);
1157 folio = NULL;
1158 }
1159
1160 if (!same_folio)
1161 folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT);
1162 if (folio) {
1163 folio_mark_dirty(folio);
1164 if (!truncate_inode_partial_folio(folio, lstart, lend))
1165 end = folio->index;
1166 folio_unlock(folio);
1167 folio_put(folio);
1168 }
1169
1170 whole_folios:
1171
1172 index = start;
1173 while (index < end) {
1174 cond_resched();
1175
1176 if (!find_get_entries(mapping, &index, end - 1, &fbatch,
1177 indices)) {
1178 /* If all gone or hole-punch or unfalloc, we're done */
1179 if (index == start || end != -1)
1180 break;
1181 /* But if truncating, restart to make sure all gone */
1182 index = start;
1183 continue;
1184 }
1185 for (i = 0; i < folio_batch_count(&fbatch); i++) {
1186 folio = fbatch.folios[i];
1187
1188 if (xa_is_value(folio)) {
1189 long swaps_freed;
1190
1191 if (unfalloc)
1192 continue;
1193 swaps_freed = shmem_free_swap(mapping, indices[i], folio);
1194 if (!swaps_freed) {
1195 /* Swap was replaced by page: retry */
1196 index = indices[i];
1197 break;
1198 }
1199 nr_swaps_freed += swaps_freed;
1200 continue;
1201 }
1202
1203 folio_lock(folio);
1204
1205 if (!unfalloc || !folio_test_uptodate(folio)) {
1206 if (folio_mapping(folio) != mapping) {
1207 /* Page was replaced by swap: retry */
1208 folio_unlock(folio);
1209 index = indices[i];
1210 break;
1211 }
1212 VM_BUG_ON_FOLIO(folio_test_writeback(folio),
1213 folio);
1214
1215 if (!folio_test_large(folio)) {
1216 truncate_inode_folio(mapping, folio);
1217 } else if (truncate_inode_partial_folio(folio, lstart, lend)) {
1218 /*
1219 * If we split a page, reset the loop so
1220 * that we pick up the new sub pages.
1221 * Otherwise the THP was entirely
1222 * dropped or the target range was
1223 * zeroed, so just continue the loop as
1224 * is.
1225 */
1226 if (!folio_test_large(folio)) {
1227 folio_unlock(folio);
1228 index = start;
1229 break;
1230 }
1231 }
1232 }
1233 folio_unlock(folio);
1234 }
1235 folio_batch_remove_exceptionals(&fbatch);
1236 folio_batch_release(&fbatch);
1237 }
1238
1239 shmem_recalc_inode(inode, 0, -nr_swaps_freed);
1240 }
1241
1242 void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
1243 {
1244 shmem_undo_range(inode, lstart, lend, false);
1245 inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
1246 inode_inc_iversion(inode);
1247 }
1248 EXPORT_SYMBOL_GPL(shmem_truncate_range);
1249
1250 static int shmem_getattr(struct mnt_idmap *idmap,
1251 const struct path *path, struct kstat *stat,
1252 u32 request_mask, unsigned int query_flags)
1253 {
1254 struct inode *inode = path->dentry->d_inode;
1255 struct shmem_inode_info *info = SHMEM_I(inode);
1256
1257 if (info->alloced - info->swapped != inode->i_mapping->nrpages)
1258 shmem_recalc_inode(inode, 0, 0);
1259
1260 if (info->fsflags & FS_APPEND_FL)
1261 stat->attributes |= STATX_ATTR_APPEND;
1262 if (info->fsflags & FS_IMMUTABLE_FL)
1263 stat->attributes |= STATX_ATTR_IMMUTABLE;
1264 if (info->fsflags & FS_NODUMP_FL)
1265 stat->attributes |= STATX_ATTR_NODUMP;
1266 stat->attributes_mask |= (STATX_ATTR_APPEND |
1267 STATX_ATTR_IMMUTABLE |
1268 STATX_ATTR_NODUMP);
1269 generic_fillattr(idmap, request_mask, inode, stat);
1270
1271 if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0))
1272 stat->blksize = HPAGE_PMD_SIZE;
1273
1274 if (request_mask & STATX_BTIME) {
1275 stat->result_mask |= STATX_BTIME;
1276 stat->btime.tv_sec = info->i_crtime.tv_sec;
1277 stat->btime.tv_nsec = info->i_crtime.tv_nsec;
1278 }
1279
1280 return 0;
1281 }
1282
1283 static int shmem_setattr(struct mnt_idmap *idmap,
1284 struct dentry *dentry, struct iattr *attr)
1285 {
1286 struct inode *inode = d_inode(dentry);
1287 struct shmem_inode_info *info = SHMEM_I(inode);
1288 int error;
1289 bool update_mtime = false;
1290 bool update_ctime = true;
1291
1292 error = setattr_prepare(idmap, dentry, attr);
1293 if (error)
1294 return error;
1295
1296 if ((info->seals & F_SEAL_EXEC) && (attr->ia_valid & ATTR_MODE)) {
1297 if ((inode->i_mode ^ attr->ia_mode) & 0111) {
1298 return -EPERM;
1299 }
1300 }
1301
1302 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
1303 loff_t oldsize = inode->i_size;
1304 loff_t newsize = attr->ia_size;
1305
1306 /* protected by i_rwsem */
1307 if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
1308 (newsize > oldsize && (info->seals & F_SEAL_GROW)))
1309 return -EPERM;
1310
1311 if (newsize != oldsize) {
1312 error = shmem_reacct_size(SHMEM_I(inode)->flags,
1313 oldsize, newsize);
1314 if (error)
1315 return error;
1316 i_size_write(inode, newsize);
1317 update_mtime = true;
1318 } else {
1319 update_ctime = false;
1320 }
1321 if (newsize <= oldsize) {
1322 loff_t holebegin = round_up(newsize, PAGE_SIZE);
1323 if (oldsize > holebegin)
1324 unmap_mapping_range(inode->i_mapping,
1325 holebegin, 0, 1);
1326 if (info->alloced)
1327 shmem_truncate_range(inode,
1328 newsize, (loff_t)-1);
1329 /* unmap again to remove racily COWed private pages */
1330 if (oldsize > holebegin)
1331 unmap_mapping_range(inode->i_mapping,
1332 holebegin, 0, 1);
1333 }
1334 }
1335
1336 if (is_quota_modification(idmap, inode, attr)) {
1337 error = dquot_initialize(inode);
1338 if (error)
1339 return error;
1340 }
1341
1342 /* Transfer quota accounting */
1343 if (i_uid_needs_update(idmap, attr, inode) ||
1344 i_gid_needs_update(idmap, attr, inode)) {
1345 error = dquot_transfer(idmap, inode, attr);
1346 if (error)
1347 return error;
1348 }
1349
1350 setattr_copy(idmap, inode, attr);
1351 if (attr->ia_valid & ATTR_MODE)
1352 error = posix_acl_chmod(idmap, dentry, inode->i_mode);
1353 if (!error && update_ctime) {
1354 inode_set_ctime_current(inode);
1355 if (update_mtime)
1356 inode_set_mtime_to_ts(inode, inode_get_ctime(inode));
1357 inode_inc_iversion(inode);
1358 }
1359 return error;
1360 }
1361
1362 static void shmem_evict_inode(struct inode *inode)
1363 {
1364 struct shmem_inode_info *info = SHMEM_I(inode);
1365 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1366 size_t freed = 0;
1367
1368 if (shmem_mapping(inode->i_mapping)) {
1369 shmem_unacct_size(info->flags, inode->i_size);
1370 inode->i_size = 0;
1371 mapping_set_exiting(inode->i_mapping);
1372 shmem_truncate_range(inode, 0, (loff_t)-1);
1373 if (!list_empty(&info->shrinklist)) {
1374 spin_lock(&sbinfo->shrinklist_lock);
1375 if (!list_empty(&info->shrinklist)) {
1376 list_del_init(&info->shrinklist);
1377 sbinfo->shrinklist_len--;
1378 }
1379 spin_unlock(&sbinfo->shrinklist_lock);
1380 }
1381 while (!list_empty(&info->swaplist)) {
1382 /* Wait while shmem_unuse() is scanning this inode... */
1383 wait_var_event(&info->stop_eviction,
1384 !atomic_read(&info->stop_eviction));
1385 spin_lock(&shmem_swaplist_lock);
1386 /* ...but beware of the race if we peeked too early */
1387 if (!atomic_read(&info->stop_eviction))
1388 list_del_init(&info->swaplist);
1389 spin_unlock(&shmem_swaplist_lock);
1390 }
1391 }
1392
1393 simple_xattrs_free(&info->xattrs, sbinfo->max_inodes ? &freed : NULL);
1394 shmem_free_inode(inode->i_sb, freed);
1395 WARN_ON(inode->i_blocks);
1396 clear_inode(inode);
1397 #ifdef CONFIG_TMPFS_QUOTA
1398 dquot_free_inode(inode);
1399 dquot_drop(inode);
1400 #endif
1401 }
1402
1403 static unsigned int shmem_find_swap_entries(struct address_space *mapping,
1404 pgoff_t start, struct folio_batch *fbatch,
1405 pgoff_t *indices, unsigned int type)
1406 {
1407 XA_STATE(xas, &mapping->i_pages, start);
1408 struct folio *folio;
1409 swp_entry_t entry;
1410
1411 rcu_read_lock();
1412 xas_for_each(&xas, folio, ULONG_MAX) {
1413 if (xas_retry(&xas, folio))
1414 continue;
1415
1416 if (!xa_is_value(folio))
1417 continue;
1418
1419 entry = radix_to_swp_entry(folio);
1420 /*
1421 * swapin error entries can be found in the mapping. But they're
1422 * deliberately ignored here as we've done everything we can do.
1423 */
1424 if (swp_type(entry) != type)
1425 continue;
1426
1427 indices[folio_batch_count(fbatch)] = xas.xa_index;
1428 if (!folio_batch_add(fbatch, folio))
1429 break;
1430
1431 if (need_resched()) {
1432 xas_pause(&xas);
1433 cond_resched_rcu();
1434 }
1435 }
1436 rcu_read_unlock();
1437
1438 return folio_batch_count(fbatch);
1439 }
1440
1441 /*
1442 * Move the swapped pages for an inode to page cache. Returns the count
1443 * of pages swapped in, or the error in case of failure.
1444 */
1445 static int shmem_unuse_swap_entries(struct inode *inode,
1446 struct folio_batch *fbatch, pgoff_t *indices)
1447 {
1448 int i = 0;
1449 int ret = 0;
1450 int error = 0;
1451 struct address_space *mapping = inode->i_mapping;
1452
1453 for (i = 0; i < folio_batch_count(fbatch); i++) {
1454 struct folio *folio = fbatch->folios[i];
1455
1456 error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE,
1457 mapping_gfp_mask(mapping), NULL, NULL);
1458 if (error == 0) {
1459 folio_unlock(folio);
1460 folio_put(folio);
1461 ret++;
1462 }
1463 if (error == -ENOMEM)
1464 break;
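/* Errors other than -ENOMEM are dropped: clear them and keep scanning. */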
1465 error = 0;
1466 }
1467 return error ? error : ret;
1468 }
1469
1470 /*
1471 * If swap found in inode, free it and move page from swapcache to filecache.
1472 */
1473 static int shmem_unuse_inode(struct inode *inode, unsigned int type)
1474 {
1475 struct address_space *mapping = inode->i_mapping;
1476 pgoff_t start = 0;
1477 struct folio_batch fbatch;
1478 pgoff_t indices[PAGEVEC_SIZE];
1479 int ret = 0;
1480
1481 do {
1482 folio_batch_init(&fbatch);
1483 if (!shmem_find_swap_entries(mapping, start, &fbatch,
1484 indices, type)) {
1485 ret = 0;
1486 break;
1487 }
1488
1489 ret = shmem_unuse_swap_entries(inode, &fbatch, indices);
1490 if (ret < 0)
1491 break;
1492
1493 start = indices[folio_batch_count(&fbatch) - 1];
1494 } while (true);
1495
1496 return ret;
1497 }
1498
1499 /*
1500 * Read all the shared memory data that resides in the swap
1501 * device 'type' back into memory, so the swap device can be
1502 * unused.
1503 */
1504 int shmem_unuse(unsigned int type)
1505 {
1506 struct shmem_inode_info *info, *next;
1507 int error = 0;
1508
1509 if (list_empty(&shmem_swaplist))
1510 return 0;
1511
1512 spin_lock(&shmem_swaplist_lock);
1513 start_over:
1514 list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
1515 if (!info->swapped) {
1516 list_del_init(&info->swaplist);
1517 continue;
1518 }
1519 /*
1520 * Drop the swaplist mutex while searching the inode for swap;
1521 * but before doing so, make sure shmem_evict_inode() will not
1522 * remove placeholder inode from swaplist, nor let it be freed
1523 * (igrab() would protect from unlink, but not from unmount).
1524 */
1525 atomic_inc(&info->stop_eviction);
1526 spin_unlock(&shmem_swaplist_lock);
1527
1528 error = shmem_unuse_inode(&info->vfs_inode, type);
1529 cond_resched();
1530
1531 spin_lock(&shmem_swaplist_lock);
1532 if (atomic_dec_and_test(&info->stop_eviction))
1533 wake_up_var(&info->stop_eviction);
1534 if (error)
1535 break;
1536 if (list_empty(&info->swaplist))
1537 goto start_over;
1538 next = list_next_entry(info, swaplist);
1539 if (!info->swapped)
1540 list_del_init(&info->swaplist);
1541 }
1542 spin_unlock(&shmem_swaplist_lock);
1543
1544 return error;
1545 }
1546
1547 /**
1548 * shmem_writeout - Write the folio to swap
1549 * @folio: The folio to write
1550 * @plug: swap plug
1551 * @folio_list: list to put back folios on split
1552 *
1553 * Move the folio from the page cache to the swap cache.
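 *
 * Return: 0 (or an error) from swap_writeout() once the folio has been
 * submitted and unlocked, or AOP_WRITEPAGE_ACTIVATE with the folio
 * re-dirtied and still locked if it could not be swapped out.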
1554 */
1555 int shmem_writeout(struct folio *folio, struct swap_iocb **plug,
1556 struct list_head *folio_list)
1557 {
1558 struct address_space *mapping = folio->mapping;
1559 struct inode *inode = mapping->host;
1560 struct shmem_inode_info *info = SHMEM_I(inode);
1561 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1562 pgoff_t index;
1563 int nr_pages;
1564 bool split = false;
1565
1566 if ((info->flags & VM_LOCKED) || sbinfo->noswap)
1567 goto redirty;
1568
1569 if (!total_swap_pages)
1570 goto redirty;
1571
1572 /*
1573 * If CONFIG_THP_SWAP is not enabled, the large folio should be
1574 * split when swapping.
1575 *
1576 * And shrinkage of pages beyond i_size does not split swap, so
1577 * swapout of a large folio crossing i_size needs to split too
1578 * (unless fallocate has been used to preallocate beyond EOF).
1579 */
1580 if (folio_test_large(folio)) {
1581 index = shmem_fallocend(inode,
1582 DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE));
1583 if ((index > folio->index && index < folio_next_index(folio)) ||
1584 !IS_ENABLED(CONFIG_THP_SWAP))
1585 split = true;
1586 }
1587
1588 if (split) {
1589 try_split:
1590 /* Ensure the subpages are still dirty */
1591 folio_test_set_dirty(folio);
1592 if (split_folio_to_list(folio, folio_list))
1593 goto redirty;
1594 folio_clear_dirty(folio);
1595 }
1596
1597 index = folio->index;
1598 nr_pages = folio_nr_pages(folio);
1599
1600 /*
1601 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
1602 * value into swapfile.c, the only way we can correctly account for a
1603 * fallocated folio arriving here is now to initialize it and write it.
1604 *
1605 * That's okay for a folio already fallocated earlier, but if we have
1606 * not yet completed the fallocation, then (a) we want to keep track
1607 * of this folio in case we have to undo it, and (b) it may not be a
1608 * good idea to continue anyway, once we're pushing into swap. So
1609 * reactivate the folio, and let shmem_fallocate() quit when too many.
1610 */
1611 if (!folio_test_uptodate(folio)) {
1612 if (inode->i_private) {
1613 struct shmem_falloc *shmem_falloc;
1614 spin_lock(&inode->i_lock);
1615 shmem_falloc = inode->i_private;
1616 if (shmem_falloc &&
1617 !shmem_falloc->waitq &&
1618 index >= shmem_falloc->start &&
1619 index < shmem_falloc->next)
1620 shmem_falloc->nr_unswapped += nr_pages;
1621 else
1622 shmem_falloc = NULL;
1623 spin_unlock(&inode->i_lock);
1624 if (shmem_falloc)
1625 goto redirty;
1626 }
1627 folio_zero_range(folio, 0, folio_size(folio));
1628 flush_dcache_folio(folio);
1629 folio_mark_uptodate(folio);
1630 }
1631
1632 if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) {
1633 bool first_swapped = shmem_recalc_inode(inode, 0, nr_pages);
1634 int error;
1635
1636 /*
1637 * Add inode to shmem_unuse()'s list of swapped-out inodes,
1638 * if it's not already there. Do it now before the folio is
1639 * removed from page cache, when its pagelock no longer
1640 * protects the inode from eviction. And do it now, after
1641 * we've incremented swapped, because shmem_unuse() will
1642 * prune a !swapped inode from the swaplist.
1643 */
1644 if (first_swapped) {
1645 spin_lock(&shmem_swaplist_lock);
1646 if (list_empty(&info->swaplist))
1647 list_add(&info->swaplist, &shmem_swaplist);
1648 spin_unlock(&shmem_swaplist_lock);
1649 }
1650
1651 swap_shmem_alloc(folio->swap, nr_pages);
1652 shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap));
1653
1654 BUG_ON(folio_mapped(folio));
1655 error = swap_writeout(folio, plug);
1656 if (error != AOP_WRITEPAGE_ACTIVATE) {
1657 /* folio has been unlocked */
1658 return error;
1659 }
1660
1661 /*
1662 * The intention here is to avoid holding on to the swap when
1663 * zswap was unable to compress and unable to writeback; but
1664 * it will be appropriate if other reactivate cases are added.
1665 */
1666 error = shmem_add_to_page_cache(folio, mapping, index,
1667 swp_to_radix_entry(folio->swap),
1668 __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
1669 /* Swap entry might be erased by racing shmem_free_swap() */
1670 if (!error) {
1671 shmem_recalc_inode(inode, 0, -nr_pages);
1672 swap_free_nr(folio->swap, nr_pages);
1673 }
1674
1675 /*
1676 * The delete_from_swap_cache() below could be left for
1677 * shrink_folio_list()'s folio_free_swap() to dispose of;
1678 * but I'm a little nervous about letting this folio out of
1679 * shmem_writeout() in a hybrid half-tmpfs-half-swap state
1680 * e.g. folio_mapping(folio) might give an unexpected answer.
1681 */
1682 delete_from_swap_cache(folio);
1683 goto redirty;
1684 }
1685 if (nr_pages > 1)
1686 goto try_split;
1687 redirty:
1688 folio_mark_dirty(folio);
1689 return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */
1690 }
1691 EXPORT_SYMBOL_GPL(shmem_writeout);
1692
1693 #if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
1694 static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1695 {
1696 char buffer[64];
1697
1698 if (!mpol || mpol->mode == MPOL_DEFAULT)
1699 return; /* show nothing */
1700
1701 mpol_to_str(buffer, sizeof(buffer), mpol);
1702
1703 seq_printf(seq, ",mpol=%s", buffer);
1704 }
1705
1706 static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1707 {
1708 struct mempolicy *mpol = NULL;
1709 if (sbinfo->mpol) {
1710 raw_spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
1711 mpol = sbinfo->mpol;
1712 mpol_get(mpol);
1713 raw_spin_unlock(&sbinfo->stat_lock);
1714 }
1715 return mpol;
1716 }
1717 #else /* !CONFIG_NUMA || !CONFIG_TMPFS */
1718 static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1719 {
1720 }
1721 static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1722 {
1723 return NULL;
1724 }
1725 #endif /* CONFIG_NUMA && CONFIG_TMPFS */
1726
1727 static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
1728 pgoff_t index, unsigned int order, pgoff_t *ilx);
1729
1730 static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp,
1731 struct shmem_inode_info *info, pgoff_t index)
1732 {
1733 struct mempolicy *mpol;
1734 pgoff_t ilx;
1735 struct folio *folio;
1736
1737 mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
1738 folio = swap_cluster_readahead(swap, gfp, mpol, ilx);
1739 mpol_cond_put(mpol);
1740
1741 return folio;
1742 }
1743
1744 /*
1745 * Make sure huge_gfp is always more limited than limit_gfp.
1746 * Some of the flags set permissions, while others set limitations.
1747 */
1748 static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
1749 {
1750 gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
1751 gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
1752 gfp_t zoneflags = limit_gfp & GFP_ZONEMASK;
1753 gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK);
1754
1755 /* Allow allocations only from the originally specified zones. */
1756 result |= zoneflags;
1757
1758 /*
1759 * Minimize the result gfp by taking the union with the deny flags,
1760 * and the intersection of the allow flags.
1761 */
1762 result |= (limit_gfp & denyflags);
1763 result |= (huge_gfp & limit_gfp) & allowflags;
1764
1765 return result;
1766 }
1767
1768 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
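/*
 * Report whether PMD-sized huge pages may currently be used for shmem:
 * true if any per-size sysfs control enables the PMD order directly
 * (always, madvise or within_size), or via 'inherit' while the global
 * shmem_huge setting is not 'never'; always false under 'deny'.
 */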
1769 bool shmem_hpage_pmd_enabled(void)
1770 {
1771 if (shmem_huge == SHMEM_HUGE_DENY)
1772 return false;
1773 if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_always))
1774 return true;
1775 if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_madvise))
1776 return true;
1777 if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_within_size))
1778 return true;
1779 if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_inherit) &&
1780 shmem_huge != SHMEM_HUGE_NEVER)
1781 return true;
1782
1783 return false;
1784 }
1785
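/*
 * Work out the bitmask of huge (mTHP) orders allowed for this inode at
 * @index: tmpfs mounts follow the global huge policy, while anonymous
 * shmem combines the per-size sysfs controls (always/within_size/madvise)
 * with the top-level setting, which may deny everything or force the
 * 'inherit' orders.
 */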
1786 unsigned long shmem_allowable_huge_orders(struct inode *inode,
1787 struct vm_area_struct *vma, pgoff_t index,
1788 loff_t write_end, bool shmem_huge_force)
1789 {
1790 unsigned long mask = READ_ONCE(huge_shmem_orders_always);
1791 unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size);
1792 vm_flags_t vm_flags = vma ? vma->vm_flags : 0;
1793 unsigned int global_orders;
1794
1795 if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags)))
1796 return 0;
1797
1798 global_orders = shmem_huge_global_enabled(inode, index, write_end,
1799 shmem_huge_force, vma, vm_flags);
1800 /* Tmpfs huge pages allocation */
1801 if (!vma || !vma_is_anon_shmem(vma))
1802 return global_orders;
1803
1804 /*
1805 * Following the 'deny' semantics of the top level, force the huge
1806 * option off from all mounts.
1807 */
1808 if (shmem_huge == SHMEM_HUGE_DENY)
1809 return 0;
1810
1811 /*
1812 * Only allow inherit orders if the top-level value is 'force', which
1813	 * means non-PMD sized THP cannot override the 'huge' mount option now.
1814 */
1815 if (shmem_huge == SHMEM_HUGE_FORCE)
1816 return READ_ONCE(huge_shmem_orders_inherit);
1817
1818 /* Allow mTHP that will be fully within i_size. */
1819 mask |= shmem_get_orders_within_size(inode, within_size_orders, index, 0);
1820
1821 if (vm_flags & VM_HUGEPAGE)
1822 mask |= READ_ONCE(huge_shmem_orders_madvise);
1823
1824 if (global_orders > 0)
1825 mask |= READ_ONCE(huge_shmem_orders_inherit);
1826
1827 return THP_ORDERS_ALL_FILE_DEFAULT & mask;
1828 }
1829
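/*
 * Narrow the allowed huge orders down to those that fit the faulting vma's
 * alignment and whose naturally aligned index range in the page cache is
 * still empty, so a large folio can be added there without conflict.
 */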
1830 static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf,
1831 struct address_space *mapping, pgoff_t index,
1832 unsigned long orders)
1833 {
1834 struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
1835 pgoff_t aligned_index;
1836 unsigned long pages;
1837 int order;
1838
1839 if (vma) {
1840 orders = thp_vma_suitable_orders(vma, vmf->address, orders);
1841 if (!orders)
1842 return 0;
1843 }
1844
1845	 /* Find the highest order that can be added into the page cache */
1846 order = highest_order(orders);
1847 while (orders) {
1848 pages = 1UL << order;
1849 aligned_index = round_down(index, pages);
1850 /*
1851 * Check for conflict before waiting on a huge allocation.
1852 * Conflict might be that a huge page has just been allocated
1853 * and added to page cache by a racing thread, or that there
1854 * is already at least one small page in the huge extent.
1855 * Be careful to retry when appropriate, but not forever!
1856 * Elsewhere -EEXIST would be the right code, but not here.
1857 */
1858 if (!xa_find(&mapping->i_pages, &aligned_index,
1859 aligned_index + pages - 1, XA_PRESENT))
1860 break;
1861 order = next_order(&orders, order);
1862 }
1863
1864 return orders;
1865 }
1866 #else
1867 static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf,
1868 struct address_space *mapping, pgoff_t index,
1869 unsigned long orders)
1870 {
1871 return 0;
1872 }
1873 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1874
1875 static struct folio *shmem_alloc_folio(gfp_t gfp, int order,
1876 struct shmem_inode_info *info, pgoff_t index)
1877 {
1878 struct mempolicy *mpol;
1879 pgoff_t ilx;
1880 struct folio *folio;
1881
1882 mpol = shmem_get_pgoff_policy(info, index, order, &ilx);
1883 folio = folio_alloc_mpol(gfp, order, mpol, ilx, numa_node_id());
1884 mpol_cond_put(mpol);
1885
1886 return folio;
1887 }
1888
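/*
 * Allocate the largest suitable folio for @index, charge it to the memcg
 * and insert it into the page cache, then account its blocks against the
 * inode; if accounting fails, try once to reclaim space by shrinking huge
 * folios extending beyond i_size before giving up.
 */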
1889 static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
1890 gfp_t gfp, struct inode *inode, pgoff_t index,
1891 struct mm_struct *fault_mm, unsigned long orders)
1892 {
1893 struct address_space *mapping = inode->i_mapping;
1894 struct shmem_inode_info *info = SHMEM_I(inode);
1895 unsigned long suitable_orders = 0;
1896 struct folio *folio = NULL;
1897 long pages;
1898 int error, order;
1899
1900 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
1901 orders = 0;
1902
1903 if (orders > 0) {
1904 suitable_orders = shmem_suitable_orders(inode, vmf,
1905 mapping, index, orders);
1906
1907 order = highest_order(suitable_orders);
1908 while (suitable_orders) {
1909 pages = 1UL << order;
1910 index = round_down(index, pages);
1911 folio = shmem_alloc_folio(gfp, order, info, index);
1912 if (folio)
1913 goto allocated;
1914
1915 if (pages == HPAGE_PMD_NR)
1916 count_vm_event(THP_FILE_FALLBACK);
1917 count_mthp_stat(order, MTHP_STAT_SHMEM_FALLBACK);
1918 order = next_order(&suitable_orders, order);
1919 }
1920 } else {
1921 pages = 1;
1922 folio = shmem_alloc_folio(gfp, 0, info, index);
1923 }
1924 if (!folio)
1925 return ERR_PTR(-ENOMEM);
1926
1927 allocated:
1928 __folio_set_locked(folio);
1929 __folio_set_swapbacked(folio);
1930
1931 gfp &= GFP_RECLAIM_MASK;
1932 error = mem_cgroup_charge(folio, fault_mm, gfp);
1933 if (error) {
1934 if (xa_find(&mapping->i_pages, &index,
1935 index + pages - 1, XA_PRESENT)) {
1936 error = -EEXIST;
1937 } else if (pages > 1) {
1938 if (pages == HPAGE_PMD_NR) {
1939 count_vm_event(THP_FILE_FALLBACK);
1940 count_vm_event(THP_FILE_FALLBACK_CHARGE);
1941 }
1942 count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK);
1943 count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK_CHARGE);
1944 }
1945 goto unlock;
1946 }
1947
1948 error = shmem_add_to_page_cache(folio, mapping, index, NULL, gfp);
1949 if (error)
1950 goto unlock;
1951
1952 error = shmem_inode_acct_blocks(inode, pages);
1953 if (error) {
1954 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1955 long freed;
1956 /*
1957 * Try to reclaim some space by splitting a few
1958 * large folios beyond i_size on the filesystem.
1959 */
1960 shmem_unused_huge_shrink(sbinfo, NULL, pages);
1961 /*
1962 * And do a shmem_recalc_inode() to account for freed pages:
1963 * except our folio is there in cache, so not quite balanced.
1964 */
1965 spin_lock(&info->lock);
1966 freed = pages + info->alloced - info->swapped -
1967 READ_ONCE(mapping->nrpages);
1968 if (freed > 0)
1969 info->alloced -= freed;
1970 spin_unlock(&info->lock);
1971 if (freed > 0)
1972 shmem_inode_unacct_blocks(inode, freed);
1973 error = shmem_inode_acct_blocks(inode, pages);
1974 if (error) {
1975 filemap_remove_folio(folio);
1976 goto unlock;
1977 }
1978 }
1979
1980 shmem_recalc_inode(inode, pages, 0);
1981 folio_add_lru(folio);
1982 return folio;
1983
1984 unlock:
1985 folio_unlock(folio);
1986 folio_put(folio);
1987 return ERR_PTR(error);
1988 }
1989
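/*
 * Allocate a fresh folio for swapin which bypasses the swap cache (the
 * SWP_SYNCHRONOUS_IO path): charge it, claim the swap entries with
 * swapcache_prepare() to exclude parallel swapins, then start the read.
 */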
1990 static struct folio *shmem_swap_alloc_folio(struct inode *inode,
1991 struct vm_area_struct *vma, pgoff_t index,
1992 swp_entry_t entry, int order, gfp_t gfp)
1993 {
1994 struct shmem_inode_info *info = SHMEM_I(inode);
1995 struct folio *new;
1996 void *shadow;
1997 int nr_pages;
1998
1999 /*
2000 * We have arrived here because our zones are constrained, so don't
2001 * limit chance of success with further cpuset and node constraints.
2002 */
2003 gfp &= ~GFP_CONSTRAINT_MASK;
2004 if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && order > 0) {
2005 gfp_t huge_gfp = vma_thp_gfp_mask(vma);
2006
2007 gfp = limit_gfp_mask(huge_gfp, gfp);
2008 }
2009
2010 new = shmem_alloc_folio(gfp, order, info, index);
2011 if (!new)
2012 return ERR_PTR(-ENOMEM);
2013
2014 nr_pages = folio_nr_pages(new);
2015 if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL,
2016 gfp, entry)) {
2017 folio_put(new);
2018 return ERR_PTR(-ENOMEM);
2019 }
2020
2021 /*
2022 * Prevent parallel swapin from proceeding with the swap cache flag.
2023 *
2024 * Of course there is another possible concurrent scenario as well,
2025 * that is to say, the swap cache flag of a large folio has already
2026 * been set by swapcache_prepare(), while another thread may have
2027 * already split the large swap entry stored in the shmem mapping.
2028 * In this case, shmem_add_to_page_cache() will help identify the
2029 * concurrent swapin and return -EEXIST.
2030 */
2031 if (swapcache_prepare(entry, nr_pages)) {
2032 folio_put(new);
2033 return ERR_PTR(-EEXIST);
2034 }
2035
2036 __folio_set_locked(new);
2037 __folio_set_swapbacked(new);
2038 new->swap = entry;
2039
2040 memcg1_swapin(entry, nr_pages);
2041 shadow = get_shadow_from_swap_cache(entry);
2042 if (shadow)
2043 workingset_refault(new, shadow);
2044 folio_add_lru(new);
2045 swap_read_folio(new, NULL);
2046 return new;
2047 }
2048
2049 /*
2050 * When a page is moved from swapcache to shmem filecache (either by the
2051 * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of
2052 * shmem_unuse_inode()), it may have been read in earlier from swap, in
2053 * ignorance of the mapping it belongs to. If that mapping has special
2054 * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
2055 * we may need to copy to a suitable page before moving to filecache.
2056 *
2057 * In a future release, this may well be extended to respect cpuset and
2058 * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
2059 * but for now it is a simple matter of zone.
2060 */
2061 static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp)
2062 {
2063 return folio_zonenum(folio) > gfp_zone(gfp);
2064 }
2065
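/*
 * Allocate a replacement folio in a zone acceptable to @gfp, copy over the
 * old folio's contents, and substitute it for the old folio in the swap
 * cache, so the caller can go on to move the new folio into filecache.
 */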
2066 static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
2067 struct shmem_inode_info *info, pgoff_t index,
2068 struct vm_area_struct *vma)
2069 {
2070 struct folio *new, *old = *foliop;
2071 swp_entry_t entry = old->swap;
2072 struct address_space *swap_mapping = swap_address_space(entry);
2073 pgoff_t swap_index = swap_cache_index(entry);
2074 XA_STATE(xas, &swap_mapping->i_pages, swap_index);
2075 int nr_pages = folio_nr_pages(old);
2076 int error = 0, i;
2077
2078 /*
2079 * We have arrived here because our zones are constrained, so don't
2080 * limit chance of success by further cpuset and node constraints.
2081 */
2082 gfp &= ~GFP_CONSTRAINT_MASK;
2083 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2084 if (nr_pages > 1) {
2085 gfp_t huge_gfp = vma_thp_gfp_mask(vma);
2086
2087 gfp = limit_gfp_mask(huge_gfp, gfp);
2088 }
2089 #endif
2090
2091 new = shmem_alloc_folio(gfp, folio_order(old), info, index);
2092 if (!new)
2093 return -ENOMEM;
2094
2095 folio_ref_add(new, nr_pages);
2096 folio_copy(new, old);
2097 flush_dcache_folio(new);
2098
2099 __folio_set_locked(new);
2100 __folio_set_swapbacked(new);
2101 folio_mark_uptodate(new);
2102 new->swap = entry;
2103 folio_set_swapcache(new);
2104
2105 /* Swap cache still stores N entries instead of a high-order entry */
2106 xa_lock_irq(&swap_mapping->i_pages);
2107 for (i = 0; i < nr_pages; i++) {
2108 void *item = xas_load(&xas);
2109
2110 if (item != old) {
2111 error = -ENOENT;
2112 break;
2113 }
2114
2115 xas_store(&xas, new);
2116 xas_next(&xas);
2117 }
2118 if (!error) {
2119 mem_cgroup_replace_folio(old, new);
2120 shmem_update_stats(new, nr_pages);
2121 shmem_update_stats(old, -nr_pages);
2122 }
2123 xa_unlock_irq(&swap_mapping->i_pages);
2124
2125 if (unlikely(error)) {
2126 /*
2127 * Is this possible? I think not, now that our callers
2128 * check both the swapcache flag and folio->private
2129 * after getting the folio lock; but be defensive.
2130 * Reverse old to newpage for clear and free.
2131 */
2132 old = new;
2133 } else {
2134 folio_add_lru(new);
2135 *foliop = new;
2136 }
2137
2138 folio_clear_swapcache(old);
2139 old->private = NULL;
2140
2141 folio_unlock(old);
2142 /*
2143	 * The old folio has been removed from the swap cache: drop its
2144	 * 'nr_pages' references, as well as the one temporary reference
2145	 * taken from the swap cache.
2146 */
2147 folio_put_refs(old, nr_pages + 1);
2148 return error;
2149 }
2150
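/*
 * After a swapin error, replace the swap entry in the mapping with a
 * poisoned entry so that later faults return -EIO, then drop the inode's
 * swap accounting and free the swap entries themselves.
 */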
2151 static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
2152 struct folio *folio, swp_entry_t swap,
2153 bool skip_swapcache)
2154 {
2155 struct address_space *mapping = inode->i_mapping;
2156 swp_entry_t swapin_error;
2157 void *old;
2158 int nr_pages;
2159
2160 swapin_error = make_poisoned_swp_entry();
2161 old = xa_cmpxchg_irq(&mapping->i_pages, index,
2162 swp_to_radix_entry(swap),
2163 swp_to_radix_entry(swapin_error), 0);
2164 if (old != swp_to_radix_entry(swap))
2165 return;
2166
2167 nr_pages = folio_nr_pages(folio);
2168 folio_wait_writeback(folio);
2169 if (!skip_swapcache)
2170 delete_from_swap_cache(folio);
2171 /*
2172 * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks
2173 * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks)
2174 * in shmem_evict_inode().
2175 */
2176 shmem_recalc_inode(inode, -nr_pages, -nr_pages);
2177 swap_free_nr(swap, nr_pages);
2178 }
2179
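/*
 * Split a large swap entry stored in the shmem mapping so that @index can
 * be swapped in as a smaller folio, re-storing the individual swap entries
 * at each sub-index. Returns the order of the original entry (0 if it was
 * not large), or a negative error.
 */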
2180 static int shmem_split_large_entry(struct inode *inode, pgoff_t index,
2181 swp_entry_t swap, gfp_t gfp)
2182 {
2183 struct address_space *mapping = inode->i_mapping;
2184 XA_STATE_ORDER(xas, &mapping->i_pages, index, 0);
2185 int split_order = 0, entry_order;
2186 int i;
2187
2188 /* Convert user data gfp flags to xarray node gfp flags */
2189 gfp &= GFP_RECLAIM_MASK;
2190
2191 for (;;) {
2192 void *old = NULL;
2193 int cur_order;
2194 pgoff_t swap_index;
2195
2196 xas_lock_irq(&xas);
2197 old = xas_load(&xas);
2198 if (!xa_is_value(old) || swp_to_radix_entry(swap) != old) {
2199 xas_set_err(&xas, -EEXIST);
2200 goto unlock;
2201 }
2202
2203 entry_order = xas_get_order(&xas);
2204
2205 if (!entry_order)
2206 goto unlock;
2207
2208 /* Try to split large swap entry in pagecache */
2209 cur_order = entry_order;
2210 swap_index = round_down(index, 1 << entry_order);
2211
2212 split_order = xas_try_split_min_order(cur_order);
2213
2214 while (cur_order > 0) {
2215 pgoff_t aligned_index =
2216 round_down(index, 1 << cur_order);
2217 pgoff_t swap_offset = aligned_index - swap_index;
2218
2219 xas_set_order(&xas, index, split_order);
2220 xas_try_split(&xas, old, cur_order);
2221 if (xas_error(&xas))
2222 goto unlock;
2223
2224 /*
2225 * Re-set the swap entry after splitting, and the swap
2226 * offset of the original large entry must be continuous.
2227 */
2228 for (i = 0; i < 1 << cur_order;
2229 i += (1 << split_order)) {
2230 swp_entry_t tmp;
2231
2232 tmp = swp_entry(swp_type(swap),
2233 swp_offset(swap) + swap_offset +
2234 i);
2235 __xa_store(&mapping->i_pages, aligned_index + i,
2236 swp_to_radix_entry(tmp), 0);
2237 }
2238 cur_order = split_order;
2239 split_order = xas_try_split_min_order(split_order);
2240 }
2241
2242 unlock:
2243 xas_unlock_irq(&xas);
2244
2245 if (!xas_nomem(&xas, gfp))
2246 break;
2247 }
2248
2249 if (xas_error(&xas))
2250 return xas_error(&xas);
2251
2252 return entry_order;
2253 }
2254
2255 /*
2256 * Swap in the folio pointed to by *foliop.
2257 * Caller has to make sure that *foliop contains a valid swapped folio.
2258	 * Returns 0 and the folio in *foliop on success. On failure, returns the
2259 * error code and NULL in *foliop.
2260 */
2261 static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
2262 struct folio **foliop, enum sgp_type sgp,
2263 gfp_t gfp, struct vm_area_struct *vma,
2264 vm_fault_t *fault_type)
2265 {
2266 struct address_space *mapping = inode->i_mapping;
2267 struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL;
2268 struct shmem_inode_info *info = SHMEM_I(inode);
2269 struct swap_info_struct *si;
2270 struct folio *folio = NULL;
2271 bool skip_swapcache = false;
2272 swp_entry_t swap;
2273 int error, nr_pages, order, split_order;
2274
2275 VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
2276 swap = radix_to_swp_entry(*foliop);
2277 *foliop = NULL;
2278
2279 if (is_poisoned_swp_entry(swap))
2280 return -EIO;
2281
2282 si = get_swap_device(swap);
2283 if (!si) {
2284 if (!shmem_confirm_swap(mapping, index, swap))
2285 return -EEXIST;
2286 else
2287 return -EINVAL;
2288 }
2289
2290 /* Look it up and read it in.. */
2291 folio = swap_cache_get_folio(swap, NULL, 0);
2292 order = xa_get_order(&mapping->i_pages, index);
2293 if (!folio) {
2294 int nr_pages = 1 << order;
2295 bool fallback_order0 = false;
2296
2297 /* Or update major stats only when swapin succeeds?? */
2298 if (fault_type) {
2299 *fault_type |= VM_FAULT_MAJOR;
2300 count_vm_event(PGMAJFAULT);
2301 count_memcg_event_mm(fault_mm, PGMAJFAULT);
2302 }
2303
2304 /*
2305 * If uffd is active for the vma, we need per-page fault
2306		 * fidelity to maintain the uffd semantics, so fall back
2307		 * to swapping in an order-0 folio; do the same for the zswap case.
2308 * Any existing sub folio in the swap cache also blocks
2309 * mTHP swapin.
2310 */
2311 if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) ||
2312 !zswap_never_enabled() ||
2313 non_swapcache_batch(swap, nr_pages) != nr_pages))
2314 fallback_order0 = true;
2315
2316 /* Skip swapcache for synchronous device. */
2317 if (!fallback_order0 && data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
2318 folio = shmem_swap_alloc_folio(inode, vma, index, swap, order, gfp);
2319 if (!IS_ERR(folio)) {
2320 skip_swapcache = true;
2321 goto alloced;
2322 }
2323
2324 /*
2325 * Fallback to swapin order-0 folio unless the swap entry
2326 * already exists.
2327 */
2328 error = PTR_ERR(folio);
2329 folio = NULL;
2330 if (error == -EEXIST)
2331 goto failed;
2332 }
2333
2334 /*
2335 * Now swap device can only swap in order 0 folio, then we
2336 * should split the large swap entry stored in the pagecache
2337 * if necessary.
2338 */
2339 split_order = shmem_split_large_entry(inode, index, swap, gfp);
2340 if (split_order < 0) {
2341 error = split_order;
2342 goto failed;
2343 }
2344
2345 /*
2346 * If the large swap entry has already been split, it is
2347 * necessary to recalculate the new swap entry based on
2348 * the old order alignment.
2349 */
2350 if (split_order > 0) {
2351 pgoff_t offset = index - round_down(index, 1 << split_order);
2352
2353 swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
2354 }
2355
2356 /* Here we actually start the io */
2357 folio = shmem_swapin_cluster(swap, gfp, info, index);
2358 if (!folio) {
2359 error = -ENOMEM;
2360 goto failed;
2361 }
2362 } else if (order != folio_order(folio)) {
2363 /*
2364 * Swap readahead may swap in order 0 folios into swapcache
2365		 * asynchronously, while the shmem mapping can still store
2366 * large swap entries. In such cases, we should split the
2367 * large swap entry to prevent possible data corruption.
2368 */
2369 split_order = shmem_split_large_entry(inode, index, swap, gfp);
2370 if (split_order < 0) {
2371 folio_put(folio);
2372 folio = NULL;
2373 error = split_order;
2374 goto failed;
2375 }
2376
2377 /*
2378 * If the large swap entry has already been split, it is
2379 * necessary to recalculate the new swap entry based on
2380 * the old order alignment.
2381 */
2382 if (split_order > 0) {
2383 pgoff_t offset = index - round_down(index, 1 << split_order);
2384
2385 swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
2386 }
2387 }
2388
2389 alloced:
2390 /* We have to do this with folio locked to prevent races */
2391 folio_lock(folio);
2392 if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
2393 folio->swap.val != swap.val ||
2394 !shmem_confirm_swap(mapping, index, swap) ||
2395 xa_get_order(&mapping->i_pages, index) != folio_order(folio)) {
2396 error = -EEXIST;
2397 goto unlock;
2398 }
2399 if (!folio_test_uptodate(folio)) {
2400 error = -EIO;
2401 goto failed;
2402 }
2403 folio_wait_writeback(folio);
2404 nr_pages = folio_nr_pages(folio);
2405
2406 /*
2407 * Some architectures may have to restore extra metadata to the
2408 * folio after reading from swap.
2409 */
2410 arch_swap_restore(folio_swap(swap, folio), folio);
2411
2412 if (shmem_should_replace_folio(folio, gfp)) {
2413 error = shmem_replace_folio(&folio, gfp, info, index, vma);
2414 if (error)
2415 goto failed;
2416 }
2417
2418 error = shmem_add_to_page_cache(folio, mapping,
2419 round_down(index, nr_pages),
2420 swp_to_radix_entry(swap), gfp);
2421 if (error)
2422 goto failed;
2423
2424 shmem_recalc_inode(inode, 0, -nr_pages);
2425
2426 if (sgp == SGP_WRITE)
2427 folio_mark_accessed(folio);
2428
2429 if (skip_swapcache) {
2430 folio->swap.val = 0;
2431 swapcache_clear(si, swap, nr_pages);
2432 } else {
2433 delete_from_swap_cache(folio);
2434 }
2435 folio_mark_dirty(folio);
2436 swap_free_nr(swap, nr_pages);
2437 put_swap_device(si);
2438
2439 *foliop = folio;
2440 return 0;
2441 failed:
2442 if (!shmem_confirm_swap(mapping, index, swap))
2443 error = -EEXIST;
2444 if (error == -EIO)
2445 shmem_set_folio_swapin_error(inode, index, folio, swap,
2446 skip_swapcache);
2447 unlock:
2448 if (skip_swapcache)
2449 swapcache_clear(si, swap, folio_nr_pages(folio));
2450 if (folio) {
2451 folio_unlock(folio);
2452 folio_put(folio);
2453 }
2454 put_swap_device(si);
2455
2456 return error;
2457 }
2458
2459 /*
2460 * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate
2461 *
2462 * If we allocate a new one we do not mark it dirty. That's up to the
2463 * vm. If we swap it in we mark it dirty, since we also free the swap
2464 * entry: a page cannot live in both the swap and page cache.
2465 *
2466 * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL.
2467 */
2468 static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
2469 loff_t write_end, struct folio **foliop, enum sgp_type sgp,
2470 gfp_t gfp, struct vm_fault *vmf, vm_fault_t *fault_type)
2471 {
2472 struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
2473 struct mm_struct *fault_mm;
2474 struct folio *folio;
2475 int error;
2476 bool alloced;
2477 unsigned long orders = 0;
2478
2479 if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping)))
2480 return -EINVAL;
2481
2482 if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
2483 return -EFBIG;
2484 repeat:
2485 if (sgp <= SGP_CACHE &&
2486 ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode))
2487 return -EINVAL;
2488
2489 alloced = false;
2490 fault_mm = vma ? vma->vm_mm : NULL;
2491
2492 folio = filemap_get_entry(inode->i_mapping, index);
2493 if (folio && vma && userfaultfd_minor(vma)) {
2494 if (!xa_is_value(folio))
2495 folio_put(folio);
2496 *fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
2497 return 0;
2498 }
2499
2500 if (xa_is_value(folio)) {
2501 error = shmem_swapin_folio(inode, index, &folio,
2502 sgp, gfp, vma, fault_type);
2503 if (error == -EEXIST)
2504 goto repeat;
2505
2506 *foliop = folio;
2507 return error;
2508 }
2509
2510 if (folio) {
2511 folio_lock(folio);
2512
2513 /* Has the folio been truncated or swapped out? */
2514 if (unlikely(folio->mapping != inode->i_mapping)) {
2515 folio_unlock(folio);
2516 folio_put(folio);
2517 goto repeat;
2518 }
2519 if (sgp == SGP_WRITE)
2520 folio_mark_accessed(folio);
2521 if (folio_test_uptodate(folio))
2522 goto out;
2523 /* fallocated folio */
2524 if (sgp != SGP_READ)
2525 goto clear;
2526 folio_unlock(folio);
2527 folio_put(folio);
2528 }
2529
2530 /*
2531 * SGP_READ: succeed on hole, with NULL folio, letting caller zero.
2532 * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail.
2533 */
2534 *foliop = NULL;
2535 if (sgp == SGP_READ)
2536 return 0;
2537 if (sgp == SGP_NOALLOC)
2538 return -ENOENT;
2539
2540 /*
2541 * Fast cache lookup and swap lookup did not find it: allocate.
2542 */
2543
2544 if (vma && userfaultfd_missing(vma)) {
2545 *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
2546 return 0;
2547 }
2548
2549 /* Find hugepage orders that are allowed for anonymous shmem and tmpfs. */
2550 orders = shmem_allowable_huge_orders(inode, vma, index, write_end, false);
2551 if (orders > 0) {
2552 gfp_t huge_gfp;
2553
2554 huge_gfp = vma_thp_gfp_mask(vma);
2555 huge_gfp = limit_gfp_mask(huge_gfp, gfp);
2556 folio = shmem_alloc_and_add_folio(vmf, huge_gfp,
2557 inode, index, fault_mm, orders);
2558 if (!IS_ERR(folio)) {
2559 if (folio_test_pmd_mappable(folio))
2560 count_vm_event(THP_FILE_ALLOC);
2561 count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_ALLOC);
2562 goto alloced;
2563 }
2564 if (PTR_ERR(folio) == -EEXIST)
2565 goto repeat;
2566 }
2567
2568 folio = shmem_alloc_and_add_folio(vmf, gfp, inode, index, fault_mm, 0);
2569 if (IS_ERR(folio)) {
2570 error = PTR_ERR(folio);
2571 if (error == -EEXIST)
2572 goto repeat;
2573 folio = NULL;
2574 goto unlock;
2575 }
2576
2577 alloced:
2578 alloced = true;
2579 if (folio_test_large(folio) &&
2580 DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
2581 folio_next_index(folio)) {
2582 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
2583 struct shmem_inode_info *info = SHMEM_I(inode);
2584 /*
2585 * Part of the large folio is beyond i_size: subject
2586 * to shrink under memory pressure.
2587 */
2588 spin_lock(&sbinfo->shrinklist_lock);
2589 /*
2590 * _careful to defend against unlocked access to
2591 * ->shrink_list in shmem_unused_huge_shrink()
2592 */
2593 if (list_empty_careful(&info->shrinklist)) {
2594 list_add_tail(&info->shrinklist,
2595 &sbinfo->shrinklist);
2596 sbinfo->shrinklist_len++;
2597 }
2598 spin_unlock(&sbinfo->shrinklist_lock);
2599 }
2600
2601 if (sgp == SGP_WRITE)
2602 folio_set_referenced(folio);
2603 /*
2604 * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio.
2605 */
2606 if (sgp == SGP_FALLOC)
2607 sgp = SGP_WRITE;
2608 clear:
2609 /*
2610 * Let SGP_WRITE caller clear ends if write does not fill folio;
2611 * but SGP_FALLOC on a folio fallocated earlier must initialize
2612 * it now, lest undo on failure cancel our earlier guarantee.
2613 */
2614 if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) {
2615 long i, n = folio_nr_pages(folio);
2616
2617 for (i = 0; i < n; i++)
2618 clear_highpage(folio_page(folio, i));
2619 flush_dcache_folio(folio);
2620 folio_mark_uptodate(folio);
2621 }
2622
2623 /* Perhaps the file has been truncated since we checked */
2624 if (sgp <= SGP_CACHE &&
2625 ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
2626 error = -EINVAL;
2627 goto unlock;
2628 }
2629 out:
2630 *foliop = folio;
2631 return 0;
2632
2633 /*
2634 * Error recovery.
2635 */
2636 unlock:
2637 if (alloced)
2638 filemap_remove_folio(folio);
2639 shmem_recalc_inode(inode, 0, 0);
2640 if (folio) {
2641 folio_unlock(folio);
2642 folio_put(folio);
2643 }
2644 return error;
2645 }
2646
2647 /**
2648 * shmem_get_folio - find, and lock a shmem folio.
2649 * @inode: inode to search
2650 * @index: the page index.
2651 * @write_end: end of a write, could extend inode size
2652 * @foliop: pointer to the folio if found
2653 * @sgp: SGP_* flags to control behavior
2654 *
2655 * Looks up the page cache entry at @inode & @index. If a folio is
2656 * present, it is returned locked with an increased refcount.
2657 *
2658 * If the caller modifies data in the folio, it must call folio_mark_dirty()
2659 * before unlocking the folio to ensure that the folio is not reclaimed.
2660 * There is no need to reserve space before calling folio_mark_dirty().
2661 *
2662 * When no folio is found, the behavior depends on @sgp:
2663 * - for SGP_READ, *@foliop is %NULL and 0 is returned
2664 * - for SGP_NOALLOC, *@foliop is %NULL and -ENOENT is returned
2665 * - for all other flags a new folio is allocated, inserted into the
2666 * page cache and returned locked in @foliop.
2667 *
2668 * Context: May sleep.
2669 * Return: 0 if successful, else a negative error code.
2670 */
2671 int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end,
2672 struct folio **foliop, enum sgp_type sgp)
2673 {
2674 return shmem_get_folio_gfp(inode, index, write_end, foliop, sgp,
2675 mapping_gfp_mask(inode->i_mapping), NULL, NULL);
2676 }
2677 EXPORT_SYMBOL_GPL(shmem_get_folio);
2678
2679 /*
2680 * This is like autoremove_wake_function, but it removes the wait queue
2681 * entry unconditionally - even if something else had already woken the
2682 * target.
2683 */
2684 static int synchronous_wake_function(wait_queue_entry_t *wait,
2685 unsigned int mode, int sync, void *key)
2686 {
2687 int ret = default_wake_function(wait, mode, sync, key);
2688 list_del_init(&wait->entry);
2689 return ret;
2690 }
2691
2692 /*
2693 * Trinity finds that probing a hole which tmpfs is punching can
2694 * prevent the hole-punch from ever completing: which in turn
2695 * locks writers out with its hold on i_rwsem. So refrain from
2696 * faulting pages into the hole while it's being punched. Although
2697 * shmem_undo_range() does remove the additions, it may be unable to
2698 * keep up, as each new page needs its own unmap_mapping_range() call,
2699 * and the i_mmap tree grows ever slower to scan if new vmas are added.
2700 *
2701 * It does not matter if we sometimes reach this check just before the
2702 * hole-punch begins, so that one fault then races with the punch:
2703 * we just need to make racing faults a rare case.
2704 *
2705 * The implementation below would be much simpler if we just used a
2706 * standard mutex or completion: but we cannot take i_rwsem in fault,
2707 * and bloating every shmem inode for this unlikely case would be sad.
2708 */
2709 static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode)
2710 {
2711 struct shmem_falloc *shmem_falloc;
2712 struct file *fpin = NULL;
2713 vm_fault_t ret = 0;
2714
2715 spin_lock(&inode->i_lock);
2716 shmem_falloc = inode->i_private;
2717 if (shmem_falloc &&
2718 shmem_falloc->waitq &&
2719 vmf->pgoff >= shmem_falloc->start &&
2720 vmf->pgoff < shmem_falloc->next) {
2721 wait_queue_head_t *shmem_falloc_waitq;
2722 DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
2723
2724 ret = VM_FAULT_NOPAGE;
2725 fpin = maybe_unlock_mmap_for_io(vmf, NULL);
2726 shmem_falloc_waitq = shmem_falloc->waitq;
2727 prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
2728 TASK_UNINTERRUPTIBLE);
2729 spin_unlock(&inode->i_lock);
2730 schedule();
2731
2732 /*
2733 * shmem_falloc_waitq points into the shmem_fallocate()
2734 * stack of the hole-punching task: shmem_falloc_waitq
2735 * is usually invalid by the time we reach here, but
2736 * finish_wait() does not dereference it in that case;
2737		 * though i_lock is needed lest we race with wake_up_all().
2738 */
2739 spin_lock(&inode->i_lock);
2740 finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
2741 }
2742 spin_unlock(&inode->i_lock);
2743 if (fpin) {
2744 fput(fpin);
2745 ret = VM_FAULT_RETRY;
2746 }
2747 return ret;
2748 }
2749
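/*
 * Fault in a shmem page: wait if the fault lands in a hole currently being
 * punched by fallocate, then look up or allocate the folio and hand it back
 * locked in vmf->page.
 */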
2750 static vm_fault_t shmem_fault(struct vm_fault *vmf)
2751 {
2752 struct inode *inode = file_inode(vmf->vma->vm_file);
2753 gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
2754 struct folio *folio = NULL;
2755 vm_fault_t ret = 0;
2756 int err;
2757
2758 /*
2759 * Trinity finds that probing a hole which tmpfs is punching can
2760 * prevent the hole-punch from ever completing: noted in i_private.
2761 */
2762 if (unlikely(inode->i_private)) {
2763 ret = shmem_falloc_wait(vmf, inode);
2764 if (ret)
2765 return ret;
2766 }
2767
2768 WARN_ON_ONCE(vmf->page != NULL);
2769 err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, SGP_CACHE,
2770 gfp, vmf, &ret);
2771 if (err)
2772 return vmf_error(err);
2773 if (folio) {
2774 vmf->page = folio_file_page(folio, vmf->pgoff);
2775 ret |= VM_FAULT_LOCKED;
2776 }
2777 return ret;
2778 }
2779
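/*
 * Choose a mapping address suited to huge pages: take the arch's usual
 * suggestion, and if THP may be used for this object, redo the search with
 * an inflated length and shift the result so that the returned address is
 * hugepage-aligned relative to the file offset.
 */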
2780 unsigned long shmem_get_unmapped_area(struct file *file,
2781 unsigned long uaddr, unsigned long len,
2782 unsigned long pgoff, unsigned long flags)
2783 {
2784 unsigned long addr;
2785 unsigned long offset;
2786 unsigned long inflated_len;
2787 unsigned long inflated_addr;
2788 unsigned long inflated_offset;
2789 unsigned long hpage_size;
2790
2791 if (len > TASK_SIZE)
2792 return -ENOMEM;
2793
2794 addr = mm_get_unmapped_area(current->mm, file, uaddr, len, pgoff,
2795 flags);
2796
2797 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
2798 return addr;
2799 if (IS_ERR_VALUE(addr))
2800 return addr;
2801 if (addr & ~PAGE_MASK)
2802 return addr;
2803 if (addr > TASK_SIZE - len)
2804 return addr;
2805
2806 if (shmem_huge == SHMEM_HUGE_DENY)
2807 return addr;
2808 if (flags & MAP_FIXED)
2809 return addr;
2810 /*
2811 * Our priority is to support MAP_SHARED mapped hugely;
2812 * and support MAP_PRIVATE mapped hugely too, until it is COWed.
2813 * But if caller specified an address hint and we allocated area there
2814 * successfully, respect that as before.
2815 */
2816 if (uaddr == addr)
2817 return addr;
2818
2819 hpage_size = HPAGE_PMD_SIZE;
2820 if (shmem_huge != SHMEM_HUGE_FORCE) {
2821 struct super_block *sb;
2822 unsigned long __maybe_unused hpage_orders;
2823 int order = 0;
2824
2825 if (file) {
2826 VM_BUG_ON(file->f_op != &shmem_file_operations);
2827 sb = file_inode(file)->i_sb;
2828 } else {
2829 /*
2830 * Called directly from mm/mmap.c, or drivers/char/mem.c
2831 * for "/dev/zero", to create a shared anonymous object.
2832 */
2833 if (IS_ERR(shm_mnt))
2834 return addr;
2835 sb = shm_mnt->mnt_sb;
2836
2837 /*
2838 * Find the highest mTHP order used for anonymous shmem to
2839 * provide a suitable alignment address.
2840 */
2841 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2842 hpage_orders = READ_ONCE(huge_shmem_orders_always);
2843 hpage_orders |= READ_ONCE(huge_shmem_orders_within_size);
2844 hpage_orders |= READ_ONCE(huge_shmem_orders_madvise);
2845 if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER)
2846 hpage_orders |= READ_ONCE(huge_shmem_orders_inherit);
2847
2848 if (hpage_orders > 0) {
2849 order = highest_order(hpage_orders);
2850 hpage_size = PAGE_SIZE << order;
2851 }
2852 #endif
2853 }
2854 if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER && !order)
2855 return addr;
2856 }
2857
2858 if (len < hpage_size)
2859 return addr;
2860
2861 offset = (pgoff << PAGE_SHIFT) & (hpage_size - 1);
2862 if (offset && offset + len < 2 * hpage_size)
2863 return addr;
2864 if ((addr & (hpage_size - 1)) == offset)
2865 return addr;
2866
2867 inflated_len = len + hpage_size - PAGE_SIZE;
2868 if (inflated_len > TASK_SIZE)
2869 return addr;
2870 if (inflated_len < len)
2871 return addr;
2872
2873 inflated_addr = mm_get_unmapped_area(current->mm, NULL, uaddr,
2874 inflated_len, 0, flags);
2875 if (IS_ERR_VALUE(inflated_addr))
2876 return addr;
2877 if (inflated_addr & ~PAGE_MASK)
2878 return addr;
2879
2880 inflated_offset = inflated_addr & (hpage_size - 1);
2881 inflated_addr += offset - inflated_offset;
2882 if (inflated_offset > offset)
2883 inflated_addr += hpage_size;
2884
2885 if (inflated_addr > TASK_SIZE - len)
2886 return addr;
2887 return inflated_addr;
2888 }
2889
2890 #ifdef CONFIG_NUMA
2891 static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
2892 {
2893 struct inode *inode = file_inode(vma->vm_file);
2894 return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
2895 }
2896
2897 static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
2898 unsigned long addr, pgoff_t *ilx)
2899 {
2900 struct inode *inode = file_inode(vma->vm_file);
2901 pgoff_t index;
2902
2903 /*
2904 * Bias interleave by inode number to distribute better across nodes;
2905 * but this interface is independent of which page order is used, so
2906 * supplies only that bias, letting caller apply the offset (adjusted
2907 * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()).
2908 */
2909 *ilx = inode->i_ino;
2910 index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2911 return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
2912 }
2913
2914 static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
2915 pgoff_t index, unsigned int order, pgoff_t *ilx)
2916 {
2917 struct mempolicy *mpol;
2918
2919 /* Bias interleave by inode number to distribute better across nodes */
2920 *ilx = info->vfs_inode.i_ino + (index >> order);
2921
2922 mpol = mpol_shared_policy_lookup(&info->policy, index);
2923 return mpol ? mpol : get_task_policy(current);
2924 }
2925 #else
2926 static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
2927 pgoff_t index, unsigned int order, pgoff_t *ilx)
2928 {
2929 *ilx = 0;
2930 return NULL;
2931 }
2932 #endif /* CONFIG_NUMA */
2933
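/*
 * Implement SHM_LOCK/SHM_UNLOCK for a shmem file: charge or uncharge the
 * file size against the caller's locked-memory ucounts, and mark the
 * mapping unevictable (or evictable again) to match.
 */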
2934 int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
2935 {
2936 struct inode *inode = file_inode(file);
2937 struct shmem_inode_info *info = SHMEM_I(inode);
2938 int retval = -ENOMEM;
2939
2940 /*
2941 * What serializes the accesses to info->flags?
2942 * ipc_lock_object() when called from shmctl_do_lock(),
2943 * no serialization needed when called from shm_destroy().
2944 */
2945 if (lock && !(info->flags & VM_LOCKED)) {
2946 if (!user_shm_lock(inode->i_size, ucounts))
2947 goto out_nomem;
2948 info->flags |= VM_LOCKED;
2949 mapping_set_unevictable(file->f_mapping);
2950 }
2951 if (!lock && (info->flags & VM_LOCKED) && ucounts) {
2952 user_shm_unlock(inode->i_size, ucounts);
2953 info->flags &= ~VM_LOCKED;
2954 mapping_clear_unevictable(file->f_mapping);
2955 }
2956 retval = 0;
2957
2958 out_nomem:
2959 return retval;
2960 }
2961
2962 static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
2963 {
2964 struct inode *inode = file_inode(file);
2965
2966 file_accessed(file);
2967 /* This is anonymous shared memory if it is unlinked at the time of mmap */
2968 if (inode->i_nlink)
2969 vma->vm_ops = &shmem_vm_ops;
2970 else
2971 vma->vm_ops = &shmem_anon_vm_ops;
2972 return 0;
2973 }
2974
2975 static int shmem_file_open(struct inode *inode, struct file *file)
2976 {
2977 file->f_mode |= FMODE_CAN_ODIRECT;
2978 return generic_file_open(inode, file);
2979 }
2980
2981 #ifdef CONFIG_TMPFS_XATTR
2982 static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
2983
2984 #if IS_ENABLED(CONFIG_UNICODE)
2985 /*
2986 * shmem_inode_casefold_flags - Deal with casefold file attribute flag
2987 *
2988 * The casefold file attribute needs some special checks. It can only be added to
2989 * an empty dir, and can't be removed from a non-empty dir.
2990 */
2991 static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags,
2992 struct dentry *dentry, unsigned int *i_flags)
2993 {
2994 unsigned int old = inode->i_flags;
2995 struct super_block *sb = inode->i_sb;
2996
2997 if (fsflags & FS_CASEFOLD_FL) {
2998 if (!(old & S_CASEFOLD)) {
2999 if (!sb->s_encoding)
3000 return -EOPNOTSUPP;
3001
3002 if (!S_ISDIR(inode->i_mode))
3003 return -ENOTDIR;
3004
3005 if (dentry && !simple_empty(dentry))
3006 return -ENOTEMPTY;
3007 }
3008
3009 *i_flags = *i_flags | S_CASEFOLD;
3010 } else if (old & S_CASEFOLD) {
3011 if (dentry && !simple_empty(dentry))
3012 return -ENOTEMPTY;
3013 }
3014
3015 return 0;
3016 }
3017 #else
3018 static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags,
3019 struct dentry *dentry, unsigned int *i_flags)
3020 {
3021 if (fsflags & FS_CASEFOLD_FL)
3022 return -EOPNOTSUPP;
3023
3024 return 0;
3025 }
3026 #endif
3027
3028 /*
3029 * chattr's fsflags are unrelated to extended attributes,
3030 * but tmpfs has chosen to enable them under the same config option.
3031 */
3032 static int shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry)
3033 {
3034 unsigned int i_flags = 0;
3035 int ret;
3036
3037 ret = shmem_inode_casefold_flags(inode, fsflags, dentry, &i_flags);
3038 if (ret)
3039 return ret;
3040
3041 if (fsflags & FS_NOATIME_FL)
3042 i_flags |= S_NOATIME;
3043 if (fsflags & FS_APPEND_FL)
3044 i_flags |= S_APPEND;
3045 if (fsflags & FS_IMMUTABLE_FL)
3046 i_flags |= S_IMMUTABLE;
3047 /*
3048 * But FS_NODUMP_FL does not require any action in i_flags.
3049 */
3050 inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE | S_CASEFOLD);
3051
3052 return 0;
3053 }
3054 #else
3055 static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry)
3056 {
3057 }
3058 #define shmem_initxattrs NULL
3059 #endif
3060
3061 static struct offset_ctx *shmem_get_offset_ctx(struct inode *inode)
3062 {
3063 return &SHMEM_I(inode)->dir_offsets;
3064 }
3065
3066 static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
3067 struct super_block *sb,
3068 struct inode *dir, umode_t mode,
3069 dev_t dev, unsigned long flags)
3070 {
3071 struct inode *inode;
3072 struct shmem_inode_info *info;
3073 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
3074 ino_t ino;
3075 int err;
3076
3077 err = shmem_reserve_inode(sb, &ino);
3078 if (err)
3079 return ERR_PTR(err);
3080
3081 inode = new_inode(sb);
3082 if (!inode) {
3083 shmem_free_inode(sb, 0);
3084 return ERR_PTR(-ENOSPC);
3085 }
3086
3087 inode->i_ino = ino;
3088 inode_init_owner(idmap, inode, dir, mode);
3089 inode->i_blocks = 0;
3090 simple_inode_init_ts(inode);
3091 inode->i_generation = get_random_u32();
3092 info = SHMEM_I(inode);
3093 memset(info, 0, (char *)inode - (char *)info);
3094 spin_lock_init(&info->lock);
3095 atomic_set(&info->stop_eviction, 0);
3096 info->seals = F_SEAL_SEAL;
3097 info->flags = flags & VM_NORESERVE;
3098 info->i_crtime = inode_get_mtime(inode);
3099 info->fsflags = (dir == NULL) ? 0 :
3100 SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
3101 if (info->fsflags)
3102 shmem_set_inode_flags(inode, info->fsflags, NULL);
3103 INIT_LIST_HEAD(&info->shrinklist);
3104 INIT_LIST_HEAD(&info->swaplist);
3105 simple_xattrs_init(&info->xattrs);
3106 cache_no_acl(inode);
3107 if (sbinfo->noswap)
3108 mapping_set_unevictable(inode->i_mapping);
3109
3110 /* Don't consider 'deny' for emergencies and 'force' for testing */
3111 if (sbinfo->huge)
3112 mapping_set_large_folios(inode->i_mapping);
3113
3114 switch (mode & S_IFMT) {
3115 default:
3116 inode->i_op = &shmem_special_inode_operations;
3117 init_special_inode(inode, mode, dev);
3118 break;
3119 case S_IFREG:
3120 inode->i_mapping->a_ops = &shmem_aops;
3121 inode->i_op = &shmem_inode_operations;
3122 inode->i_fop = &shmem_file_operations;
3123 mpol_shared_policy_init(&info->policy,
3124 shmem_get_sbmpol(sbinfo));
3125 break;
3126 case S_IFDIR:
3127 inc_nlink(inode);
3128 /* Some things misbehave if size == 0 on a directory */
3129 inode->i_size = 2 * BOGO_DIRENT_SIZE;
3130 inode->i_op = &shmem_dir_inode_operations;
3131 inode->i_fop = &simple_offset_dir_operations;
3132 simple_offset_init(shmem_get_offset_ctx(inode));
3133 break;
3134 case S_IFLNK:
3135 /*
3136 * Must not load anything in the rbtree,
3137 * mpol_free_shared_policy will not be called.
3138 */
3139 mpol_shared_policy_init(&info->policy, NULL);
3140 break;
3141 }
3142
3143 lockdep_annotate_inode_mutex_key(inode);
3144 return inode;
3145 }
3146
3147 #ifdef CONFIG_TMPFS_QUOTA
3148 static struct inode *shmem_get_inode(struct mnt_idmap *idmap,
3149 struct super_block *sb, struct inode *dir,
3150 umode_t mode, dev_t dev, unsigned long flags)
3151 {
3152 int err;
3153 struct inode *inode;
3154
3155 inode = __shmem_get_inode(idmap, sb, dir, mode, dev, flags);
3156 if (IS_ERR(inode))
3157 return inode;
3158
3159 err = dquot_initialize(inode);
3160 if (err)
3161 goto errout;
3162
3163 err = dquot_alloc_inode(inode);
3164 if (err) {
3165 dquot_drop(inode);
3166 goto errout;
3167 }
3168 return inode;
3169
3170 errout:
3171 inode->i_flags |= S_NOQUOTA;
3172 iput(inode);
3173 return ERR_PTR(err);
3174 }
3175 #else
3176 static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
3177 struct super_block *sb, struct inode *dir,
3178 umode_t mode, dev_t dev, unsigned long flags)
3179 {
3180 return __shmem_get_inode(idmap, sb, dir, mode, dev, flags);
3181 }
3182 #endif /* CONFIG_TMPFS_QUOTA */
3183
3184 #ifdef CONFIG_USERFAULTFD
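/*
 * Back userfaultfd's UFFDIO_COPY/UFFDIO_ZEROPAGE for shmem: allocate a
 * folio, copy from userspace (or clear it), add it to the page cache and
 * install the pte. Returns -ENOENT to ask the caller to retry with the
 * copy done outside the mmap_lock when the in-atomic copy faults.
 */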
3185 int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
3186 struct vm_area_struct *dst_vma,
3187 unsigned long dst_addr,
3188 unsigned long src_addr,
3189 uffd_flags_t flags,
3190 struct folio **foliop)
3191 {
3192 struct inode *inode = file_inode(dst_vma->vm_file);
3193 struct shmem_inode_info *info = SHMEM_I(inode);
3194 struct address_space *mapping = inode->i_mapping;
3195 gfp_t gfp = mapping_gfp_mask(mapping);
3196 pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
3197 void *page_kaddr;
3198 struct folio *folio;
3199 int ret;
3200 pgoff_t max_off;
3201
3202 if (shmem_inode_acct_blocks(inode, 1)) {
3203 /*
3204 * We may have got a page, returned -ENOENT triggering a retry,
3205 * and now we find ourselves with -ENOMEM. Release the page, to
3206 * avoid a BUG_ON in our caller.
3207 */
3208 if (unlikely(*foliop)) {
3209 folio_put(*foliop);
3210 *foliop = NULL;
3211 }
3212 return -ENOMEM;
3213 }
3214
3215 if (!*foliop) {
3216 ret = -ENOMEM;
3217 folio = shmem_alloc_folio(gfp, 0, info, pgoff);
3218 if (!folio)
3219 goto out_unacct_blocks;
3220
3221 if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) {
3222 page_kaddr = kmap_local_folio(folio, 0);
3223 /*
3224 * The read mmap_lock is held here. Despite the
3225 * mmap_lock being read recursive a deadlock is still
3226 * possible if a writer has taken a lock. For example:
3227 *
3228 * process A thread 1 takes read lock on own mmap_lock
3229 * process A thread 2 calls mmap, blocks taking write lock
3230 * process B thread 1 takes page fault, read lock on own mmap lock
3231 * process B thread 2 calls mmap, blocks taking write lock
3232 * process A thread 1 blocks taking read lock on process B
3233 * process B thread 1 blocks taking read lock on process A
3234 *
3235 * Disable page faults to prevent potential deadlock
3236 * and retry the copy outside the mmap_lock.
3237 */
3238 pagefault_disable();
3239 ret = copy_from_user(page_kaddr,
3240 (const void __user *)src_addr,
3241 PAGE_SIZE);
3242 pagefault_enable();
3243 kunmap_local(page_kaddr);
3244
3245 /* fallback to copy_from_user outside mmap_lock */
3246 if (unlikely(ret)) {
3247 *foliop = folio;
3248 ret = -ENOENT;
3249 /* don't free the page */
3250 goto out_unacct_blocks;
3251 }
3252
3253 flush_dcache_folio(folio);
3254 } else { /* ZEROPAGE */
3255 clear_user_highpage(&folio->page, dst_addr);
3256 }
3257 } else {
3258 folio = *foliop;
3259 VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
3260 *foliop = NULL;
3261 }
3262
3263 VM_BUG_ON(folio_test_locked(folio));
3264 VM_BUG_ON(folio_test_swapbacked(folio));
3265 __folio_set_locked(folio);
3266 __folio_set_swapbacked(folio);
3267 __folio_mark_uptodate(folio);
3268
3269 ret = -EFAULT;
3270 max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
3271 if (unlikely(pgoff >= max_off))
3272 goto out_release;
3273
3274 ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp);
3275 if (ret)
3276 goto out_release;
3277 ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp);
3278 if (ret)
3279 goto out_release;
3280
3281 ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
3282 &folio->page, true, flags);
3283 if (ret)
3284 goto out_delete_from_cache;
3285
3286 shmem_recalc_inode(inode, 1, 0);
3287 folio_unlock(folio);
3288 return 0;
3289 out_delete_from_cache:
3290 filemap_remove_folio(folio);
3291 out_release:
3292 folio_unlock(folio);
3293 folio_put(folio);
3294 out_unacct_blocks:
3295 shmem_inode_unacct_blocks(inode, 1);
3296 return ret;
3297 }
3298 #endif /* CONFIG_USERFAULTFD */
3299
3300 #ifdef CONFIG_TMPFS
3301 static const struct inode_operations shmem_symlink_inode_operations;
3302 static const struct inode_operations shmem_short_symlink_operations;
3303
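/*
 * Write path: shmem_write_begin() enforces the write/grow seals and finds
 * or allocates the folio to be written; shmem_write_end() updates i_size,
 * zeroes any uncopied parts of a freshly allocated folio, and marks it
 * dirty before releasing it.
 */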
3304 static int
3305 shmem_write_begin(const struct kiocb *iocb, struct address_space *mapping,
3306 loff_t pos, unsigned len,
3307 struct folio **foliop, void **fsdata)
3308 {
3309 struct inode *inode = mapping->host;
3310 struct shmem_inode_info *info = SHMEM_I(inode);
3311 pgoff_t index = pos >> PAGE_SHIFT;
3312 struct folio *folio;
3313 int ret = 0;
3314
3315 /* i_rwsem is held by caller */
3316 if (unlikely(info->seals & (F_SEAL_GROW |
3317 F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
3318 if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
3319 return -EPERM;
3320 if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
3321 return -EPERM;
3322 }
3323
3324 ret = shmem_get_folio(inode, index, pos + len, &folio, SGP_WRITE);
3325 if (ret)
3326 return ret;
3327
3328 if (folio_contain_hwpoisoned_page(folio)) {
3329 folio_unlock(folio);
3330 folio_put(folio);
3331 return -EIO;
3332 }
3333
3334 *foliop = folio;
3335 return 0;
3336 }
3337
3338 static int
3339 shmem_write_end(const struct kiocb *iocb, struct address_space *mapping,
3340 loff_t pos, unsigned len, unsigned copied,
3341 struct folio *folio, void *fsdata)
3342 {
3343 struct inode *inode = mapping->host;
3344
3345 if (pos + copied > inode->i_size)
3346 i_size_write(inode, pos + copied);
3347
3348 if (!folio_test_uptodate(folio)) {
3349 if (copied < folio_size(folio)) {
3350 size_t from = offset_in_folio(folio, pos);
3351 folio_zero_segments(folio, 0, from,
3352 from + copied, folio_size(folio));
3353 }
3354 folio_mark_uptodate(folio);
3355 }
3356 folio_mark_dirty(folio);
3357 folio_unlock(folio);
3358 folio_put(folio);
3359
3360 return copied;
3361 }
3362
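/*
 * Copy data out folio by folio. Holes are never allocated on read: they
 * are satisfied by copying the zero page to userspace, or by zeroing the
 * iterator for pipes and other non-user-backed destinations.
 */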
3363 static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
3364 {
3365 struct file *file = iocb->ki_filp;
3366 struct inode *inode = file_inode(file);
3367 struct address_space *mapping = inode->i_mapping;
3368 pgoff_t index;
3369 unsigned long offset;
3370 int error = 0;
3371 ssize_t retval = 0;
3372
3373 for (;;) {
3374 struct folio *folio = NULL;
3375 struct page *page = NULL;
3376 unsigned long nr, ret;
3377 loff_t end_offset, i_size = i_size_read(inode);
3378 bool fallback_page_copy = false;
3379 size_t fsize;
3380
3381 if (unlikely(iocb->ki_pos >= i_size))
3382 break;
3383
3384 index = iocb->ki_pos >> PAGE_SHIFT;
3385 error = shmem_get_folio(inode, index, 0, &folio, SGP_READ);
3386 if (error) {
3387 if (error == -EINVAL)
3388 error = 0;
3389 break;
3390 }
3391 if (folio) {
3392 folio_unlock(folio);
3393
3394 page = folio_file_page(folio, index);
3395 if (PageHWPoison(page)) {
3396 folio_put(folio);
3397 error = -EIO;
3398 break;
3399 }
3400
3401 if (folio_test_large(folio) &&
3402 folio_test_has_hwpoisoned(folio))
3403 fallback_page_copy = true;
3404 }
3405
3406 /*
3407		 * We must re-evaluate i_size after the lookup, since reads (unlike
3408		 * writes) are called without i_rwsem protection against truncate
3409 */
3410 i_size = i_size_read(inode);
3411 if (unlikely(iocb->ki_pos >= i_size)) {
3412 if (folio)
3413 folio_put(folio);
3414 break;
3415 }
3416 end_offset = min_t(loff_t, i_size, iocb->ki_pos + to->count);
3417 if (folio && likely(!fallback_page_copy))
3418 fsize = folio_size(folio);
3419 else
3420 fsize = PAGE_SIZE;
3421 offset = iocb->ki_pos & (fsize - 1);
3422 nr = min_t(loff_t, end_offset - iocb->ki_pos, fsize - offset);
3423
3424 if (folio) {
3425 /*
3426 * If users can be writing to this page using arbitrary
3427 * virtual addresses, take care about potential aliasing
3428 * before reading the page on the kernel side.
3429 */
3430 if (mapping_writably_mapped(mapping)) {
3431 if (likely(!fallback_page_copy))
3432 flush_dcache_folio(folio);
3433 else
3434 flush_dcache_page(page);
3435 }
3436
3437 /*
3438 * Mark the folio accessed if we read the beginning.
3439 */
3440 if (!offset)
3441 folio_mark_accessed(folio);
3442 /*
3443 * Ok, we have the page, and it's up-to-date, so
3444 * now we can copy it to user space...
3445 */
3446 if (likely(!fallback_page_copy))
3447 ret = copy_folio_to_iter(folio, offset, nr, to);
3448 else
3449 ret = copy_page_to_iter(page, offset, nr, to);
3450 folio_put(folio);
3451 } else if (user_backed_iter(to)) {
3452 /*
3453 * Copy to user tends to be so well optimized, but
3454 * clear_user() not so much, that it is noticeably
3455 * faster to copy the zero page instead of clearing.
3456 */
3457 ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to);
3458 } else {
3459 /*
3460 * But submitting the same page twice in a row to
3461 * splice() - or others? - can result in confusion:
3462 * so don't attempt that optimization on pipes etc.
3463 */
3464 ret = iov_iter_zero(nr, to);
3465 }
3466
3467 retval += ret;
3468 iocb->ki_pos += ret;
3469
3470 if (!iov_iter_count(to))
3471 break;
3472 if (ret < nr) {
3473 error = -EFAULT;
3474 break;
3475 }
3476 cond_resched();
3477 }
3478
3479 file_accessed(file);
3480 return retval ? retval : error;
3481 }
3482
3483 static ssize_t shmem_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
3484 {
3485 struct file *file = iocb->ki_filp;
3486 struct inode *inode = file->f_mapping->host;
3487 ssize_t ret;
3488
3489 inode_lock(inode);
3490 ret = generic_write_checks(iocb, from);
3491 if (ret <= 0)
3492 goto unlock;
3493 ret = file_remove_privs(file);
3494 if (ret)
3495 goto unlock;
3496 ret = file_update_time(file);
3497 if (ret)
3498 goto unlock;
3499 ret = generic_perform_write(iocb, from);
3500 unlock:
3501 inode_unlock(inode);
3502 return ret;
3503 }
3504
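/*
 * The zero_pipe_buf_*() ops below are deliberately no-ops: the buffer
 * always wraps the global ZERO_PAGE(0), which is never freed, so there is
 * nothing extra to pin in ->get() and nothing to drop in ->release();
 * ->try_steal() must refuse so the shared zero page is never handed over
 * for reuse by a pipe writer.
 */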
3505 static bool zero_pipe_buf_get(struct pipe_inode_info *pipe,
3506 struct pipe_buffer *buf)
3507 {
3508 return true;
3509 }
3510
3511 static void zero_pipe_buf_release(struct pipe_inode_info *pipe,
3512 struct pipe_buffer *buf)
3513 {
3514 }
3515
3516 static bool zero_pipe_buf_try_steal(struct pipe_inode_info *pipe,
3517 struct pipe_buffer *buf)
3518 {
3519 return false;
3520 }
3521
3522 static const struct pipe_buf_operations zero_pipe_buf_ops = {
3523 .release = zero_pipe_buf_release,
3524 .try_steal = zero_pipe_buf_try_steal,
3525 .get = zero_pipe_buf_get,
3526 };
3527
3528 static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe,
3529 loff_t fpos, size_t size)
3530 {
3531 size_t offset = fpos & ~PAGE_MASK;
3532
3533 size = min_t(size_t, size, PAGE_SIZE - offset);
3534
3535 if (!pipe_is_full(pipe)) {
3536 struct pipe_buffer *buf = pipe_head_buf(pipe);
3537
3538 *buf = (struct pipe_buffer) {
3539 .ops = &zero_pipe_buf_ops,
3540 .page = ZERO_PAGE(0),
3541 .offset = offset,
3542 .len = size,
3543 };
3544 pipe->head++;
3545 }
3546
3547 return size;
3548 }
3549
3550 static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
3551 struct pipe_inode_info *pipe,
3552 size_t len, unsigned int flags)
3553 {
3554 struct inode *inode = file_inode(in);
3555 struct address_space *mapping = inode->i_mapping;
3556 struct folio *folio = NULL;
3557 size_t total_spliced = 0, used, npages, n, part;
3558 loff_t isize;
3559 int error = 0;
3560
3561 /* Work out how much data we can actually add into the pipe */
3562 used = pipe_buf_usage(pipe);
3563 npages = max_t(ssize_t, pipe->max_usage - used, 0);
3564 len = min_t(size_t, len, npages * PAGE_SIZE);
3565
3566 do {
3567 bool fallback_page_splice = false;
3568 struct page *page = NULL;
3569 pgoff_t index;
3570 size_t size;
3571
3572 if (*ppos >= i_size_read(inode))
3573 break;
3574
3575 index = *ppos >> PAGE_SHIFT;
3576 error = shmem_get_folio(inode, index, 0, &folio, SGP_READ);
3577 if (error) {
3578 if (error == -EINVAL)
3579 error = 0;
3580 break;
3581 }
3582 if (folio) {
3583 folio_unlock(folio);
3584
3585 page = folio_file_page(folio, index);
3586 if (PageHWPoison(page)) {
3587 error = -EIO;
3588 break;
3589 }
3590
3591 if (folio_test_large(folio) &&
3592 folio_test_has_hwpoisoned(folio))
3593 fallback_page_splice = true;
3594 }
3595
3596 /*
3597 * i_size must be checked after we know the pages are Uptodate.
3598 *
3599 * Checking i_size after looking up the folio allows us to
3600 * calculate the correct value for "part", so the zero-filled
3601 * part of the page is not spliced into the pipe (unless
3602 * another truncate extends the file - but that is desired).
3603 */
3604 isize = i_size_read(inode);
3605 if (unlikely(*ppos >= isize))
3606 break;
3607 /*
3608 * Fallback to PAGE_SIZE splice if the large folio has hwpoisoned
3609 * pages.
3610 */
3611 size = len;
3612 if (unlikely(fallback_page_splice)) {
3613 size_t offset = *ppos & ~PAGE_MASK;
3614
3615 size = umin(size, PAGE_SIZE - offset);
3616 }
3617 part = min_t(loff_t, isize - *ppos, size);
3618
3619 if (folio) {
3620 /*
3621 * If users can be writing to this page using arbitrary
3622 * virtual addresses, take care about potential aliasing
3623 * before reading the page on the kernel side.
3624 */
3625 if (mapping_writably_mapped(mapping)) {
3626 if (likely(!fallback_page_splice))
3627 flush_dcache_folio(folio);
3628 else
3629 flush_dcache_page(page);
3630 }
3631 folio_mark_accessed(folio);
3632 /*
3633 * Ok, we have the page, and it's up-to-date, so we can
3634 * now splice it into the pipe.
3635 */
3636 n = splice_folio_into_pipe(pipe, folio, *ppos, part);
3637 folio_put(folio);
3638 folio = NULL;
3639 } else {
3640 n = splice_zeropage_into_pipe(pipe, *ppos, part);
3641 }
3642
3643 if (!n)
3644 break;
3645 len -= n;
3646 total_spliced += n;
3647 *ppos += n;
3648 in->f_ra.prev_pos = *ppos;
3649 if (pipe_is_full(pipe))
3650 break;
3651
3652 cond_resched();
3653 } while (len);
3654
3655 if (folio)
3656 folio_put(folio);
3657
3658 file_accessed(in);
3659 return total_spliced ? total_spliced : error;
3660 }
3661
3662 static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
3663 {
3664 struct address_space *mapping = file->f_mapping;
3665 struct inode *inode = mapping->host;
3666
3667 if (whence != SEEK_DATA && whence != SEEK_HOLE)
3668 return generic_file_llseek_size(file, offset, whence,
3669 MAX_LFS_FILESIZE, i_size_read(inode));
3670 if (offset < 0)
3671 return -ENXIO;
3672
3673 inode_lock(inode);
3674 /* We're holding i_rwsem so we can access i_size directly */
3675 offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence);
3676 if (offset >= 0)
3677 offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
3678 inode_unlock(inode);
3679 return offset;
3680 }
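
/*
 * Illustrative use of the SEEK_DATA/SEEK_HOLE support above (hypothetical
 * fd, error handling omitted): with a single 4KiB write at offset 64KiB
 * into an otherwise sparse tmpfs file,
 *
 *	off_t data = lseek(fd, 0, SEEK_DATA);		// 65536
 *	off_t hole = lseek(fd, data, SEEK_HOLE);	// 69632
 *
 * mapping_seek_hole_data() answers both from the page cache, so the result
 * reflects which pages are actually instantiated (a huge=always mount may
 * instantiate more than the single written page).
 */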
3681
3682 static long shmem_fallocate(struct file *file, int mode, loff_t offset,
3683 loff_t len)
3684 {
3685 struct inode *inode = file_inode(file);
3686 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
3687 struct shmem_inode_info *info = SHMEM_I(inode);
3688 struct shmem_falloc shmem_falloc;
3689 pgoff_t start, index, end, undo_fallocend;
3690 int error;
3691
3692 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
3693 return -EOPNOTSUPP;
3694
3695 inode_lock(inode);
3696
3697 if (mode & FALLOC_FL_PUNCH_HOLE) {
3698 struct address_space *mapping = file->f_mapping;
3699 loff_t unmap_start = round_up(offset, PAGE_SIZE);
3700 loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
3701 DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
3702
3703 /* protected by i_rwsem */
3704 if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
3705 error = -EPERM;
3706 goto out;
3707 }
3708
3709 shmem_falloc.waitq = &shmem_falloc_waitq;
3710 shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
3711 shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
3712 spin_lock(&inode->i_lock);
3713 inode->i_private = &shmem_falloc;
3714 spin_unlock(&inode->i_lock);
3715
3716 if ((u64)unmap_end > (u64)unmap_start)
3717 unmap_mapping_range(mapping, unmap_start,
3718 1 + unmap_end - unmap_start, 0);
3719 shmem_truncate_range(inode, offset, offset + len - 1);
3720 /* No need to unmap again: hole-punching leaves COWed pages */
3721
3722 spin_lock(&inode->i_lock);
3723 inode->i_private = NULL;
3724 wake_up_all(&shmem_falloc_waitq);
3725 WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));
3726 spin_unlock(&inode->i_lock);
3727 error = 0;
3728 goto out;
3729 }
3730
3731 /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
3732 error = inode_newsize_ok(inode, offset + len);
3733 if (error)
3734 goto out;
3735
3736 if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
3737 error = -EPERM;
3738 goto out;
3739 }
3740
3741 start = offset >> PAGE_SHIFT;
3742 end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
3743 /* Try to avoid a swapstorm if len is impossible to satisfy */
3744 if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
3745 error = -ENOSPC;
3746 goto out;
3747 }
3748
3749 shmem_falloc.waitq = NULL;
3750 shmem_falloc.start = start;
3751 shmem_falloc.next = start;
3752 shmem_falloc.nr_falloced = 0;
3753 shmem_falloc.nr_unswapped = 0;
3754 spin_lock(&inode->i_lock);
3755 inode->i_private = &shmem_falloc;
3756 spin_unlock(&inode->i_lock);
3757
3758 /*
3759 * info->fallocend is only relevant when huge pages might be
3760 * involved: to prevent split_huge_page() freeing fallocated
3761 * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size.
3762 */
3763 undo_fallocend = info->fallocend;
3764 if (info->fallocend < end)
3765 info->fallocend = end;
3766
3767 for (index = start; index < end; ) {
3768 struct folio *folio;
3769
3770 /*
3771 * Check for fatal signal so that we abort early in OOM
3772 * situations. We don't want to abort in case of non-fatal
3773 * signals as large fallocate can take noticeable time and
3774 * e.g. periodic timers may result in fallocate constantly
3775 * restarting.
3776 */
3777 if (fatal_signal_pending(current))
3778 error = -EINTR;
3779 else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
3780 error = -ENOMEM;
3781 else
3782 error = shmem_get_folio(inode, index, offset + len,
3783 &folio, SGP_FALLOC);
3784 if (error) {
3785 info->fallocend = undo_fallocend;
3786 /* Remove the !uptodate folios we added */
3787 if (index > start) {
3788 shmem_undo_range(inode,
3789 (loff_t)start << PAGE_SHIFT,
3790 ((loff_t)index << PAGE_SHIFT) - 1, true);
3791 }
3792 goto undone;
3793 }
3794
3795 /*
3796 * Here is a more important optimization than it appears:
3797 * a second SGP_FALLOC on the same large folio will clear it,
3798 * making it uptodate and un-undoable if we fail later.
3799 */
3800 index = folio_next_index(folio);
3801 /* Beware 32-bit wraparound */
3802 if (!index)
3803 index--;
3804
3805 /*
3806 * Inform shmem_writeout() how far we have reached.
3807 * No need for lock or barrier: we have the page lock.
3808 */
3809 if (!folio_test_uptodate(folio))
3810 shmem_falloc.nr_falloced += index - shmem_falloc.next;
3811 shmem_falloc.next = index;
3812
3813 /*
3814 * If !uptodate, leave it that way so that freeable folios
3815 * can be recognized if we need to rollback on error later.
3816 * But mark it dirty so that memory pressure will swap rather
3817 * than free the folios we are allocating (and SGP_CACHE folios
3818 * might still be clean: we now need to mark those dirty too).
3819 */
3820 folio_mark_dirty(folio);
3821 folio_unlock(folio);
3822 folio_put(folio);
3823 cond_resched();
3824 }
3825
3826 if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
3827 i_size_write(inode, offset + len);
3828 undone:
3829 spin_lock(&inode->i_lock);
3830 inode->i_private = NULL;
3831 spin_unlock(&inode->i_lock);
3832 out:
3833 if (!error)
3834 file_modified(file);
3835 inode_unlock(inode);
3836 return error;
3837 }
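
/*
 * Illustrative userspace sketch of the two fallocate() modes handled above
 * (hypothetical fd, error handling omitted):
 *
 *	// Preallocate 8 MiB so later writes cannot fail with ENOSPC;
 *	// without FALLOC_FL_KEEP_SIZE this also extends i_size.
 *	fallocate(fd, 0, 0, 8 << 20);
 *
 *	// Punch a 1 MiB hole at offset 2 MiB: the pages are freed, reads
 *	// of the range return zeroes, and i_size is left unchanged.
 *	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 *		  2 << 20, 1 << 20);
 */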
3838
3839 static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
3840 {
3841 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
3842
3843 buf->f_type = TMPFS_MAGIC;
3844 buf->f_bsize = PAGE_SIZE;
3845 buf->f_namelen = NAME_MAX;
3846 if (sbinfo->max_blocks) {
3847 buf->f_blocks = sbinfo->max_blocks;
3848 buf->f_bavail =
3849 buf->f_bfree = sbinfo->max_blocks -
3850 percpu_counter_sum(&sbinfo->used_blocks);
3851 }
3852 if (sbinfo->max_inodes) {
3853 buf->f_files = sbinfo->max_inodes;
3854 buf->f_ffree = sbinfo->free_ispace / BOGO_INODE_SIZE;
3855 }
3856 /* else leave those fields 0 like simple_statfs */
3857
3858 buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b);
3859
3860 return 0;
3861 }
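
/*
 * What the fields filled in above look like from userspace (illustrative;
 * a tmpfs mounted with size=1G on a 4KiB-page system):
 *
 *	struct statfs st;
 *	statfs("/mnt/t", &st);
 *	// st.f_type   == TMPFS_MAGIC
 *	// st.f_bsize  == 4096
 *	// st.f_blocks == 262144; f_bfree/f_bavail shrink as pages are used
 *	// f_blocks and f_files stay 0 on an unlimited (size=0) mount
 */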
3862
3863 /*
3864 * File creation. Allocate an inode, and we're done..
3865 */
3866 static int
3867 shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
3868 struct dentry *dentry, umode_t mode, dev_t dev)
3869 {
3870 struct inode *inode;
3871 int error;
3872
3873 if (!generic_ci_validate_strict_name(dir, &dentry->d_name))
3874 return -EINVAL;
3875
3876 inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE);
3877 if (IS_ERR(inode))
3878 return PTR_ERR(inode);
3879
3880 error = simple_acl_create(dir, inode);
3881 if (error)
3882 goto out_iput;
3883 error = security_inode_init_security(inode, dir, &dentry->d_name,
3884 shmem_initxattrs, NULL);
3885 if (error && error != -EOPNOTSUPP)
3886 goto out_iput;
3887
3888 error = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
3889 if (error)
3890 goto out_iput;
3891
3892 dir->i_size += BOGO_DIRENT_SIZE;
3893 inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
3894 inode_inc_iversion(dir);
3895
3896 if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
3897 d_add(dentry, inode);
3898 else
3899 d_instantiate(dentry, inode);
3900
3901 dget(dentry); /* Extra count - pin the dentry in core */
3902 return error;
3903
3904 out_iput:
3905 iput(inode);
3906 return error;
3907 }
3908
3909 static int
3910 shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
3911 struct file *file, umode_t mode)
3912 {
3913 struct inode *inode;
3914 int error;
3915
3916 inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE);
3917 if (IS_ERR(inode)) {
3918 error = PTR_ERR(inode);
3919 goto err_out;
3920 }
3921 error = security_inode_init_security(inode, dir, NULL,
3922 shmem_initxattrs, NULL);
3923 if (error && error != -EOPNOTSUPP)
3924 goto out_iput;
3925 error = simple_acl_create(dir, inode);
3926 if (error)
3927 goto out_iput;
3928 d_tmpfile(file, inode);
3929
3930 err_out:
3931 return finish_open_simple(file, error);
3932 out_iput:
3933 iput(inode);
3934 return error;
3935 }
3936
3937 static struct dentry *shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir,
3938 struct dentry *dentry, umode_t mode)
3939 {
3940 int error;
3941
3942 error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0);
3943 if (error)
3944 return ERR_PTR(error);
3945 inc_nlink(dir);
3946 return NULL;
3947 }
3948
3949 static int shmem_create(struct mnt_idmap *idmap, struct inode *dir,
3950 struct dentry *dentry, umode_t mode, bool excl)
3951 {
3952 return shmem_mknod(idmap, dir, dentry, mode | S_IFREG, 0);
3953 }
3954
3955 /*
3956 * Link a file..
3957 */
3958 static int shmem_link(struct dentry *old_dentry, struct inode *dir,
3959 struct dentry *dentry)
3960 {
3961 struct inode *inode = d_inode(old_dentry);
3962 int ret = 0;
3963
3964 /*
3965 * No ordinary (disk based) filesystem counts links as inodes;
3966 * but each new link needs a new dentry, pinning lowmem, and
3967 * tmpfs dentries cannot be pruned until they are unlinked.
3968 * But if an O_TMPFILE file is linked into the tmpfs, the
3969 * first link must skip that, to get the accounting right.
3970 */
3971 if (inode->i_nlink) {
3972 ret = shmem_reserve_inode(inode->i_sb, NULL);
3973 if (ret)
3974 goto out;
3975 }
3976
3977 ret = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
3978 if (ret) {
3979 if (inode->i_nlink)
3980 shmem_free_inode(inode->i_sb, 0);
3981 goto out;
3982 }
3983
3984 dir->i_size += BOGO_DIRENT_SIZE;
3985 inode_set_mtime_to_ts(dir,
3986 inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
3987 inode_inc_iversion(dir);
3988 inc_nlink(inode);
3989 ihold(inode); /* New dentry reference */
3990 dget(dentry); /* Extra pinning count for the created dentry */
3991 if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
3992 d_add(dentry, inode);
3993 else
3994 d_instantiate(dentry, inode);
3995 out:
3996 return ret;
3997 }
3998
3999 static int shmem_unlink(struct inode *dir, struct dentry *dentry)
4000 {
4001 struct inode *inode = d_inode(dentry);
4002
4003 if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
4004 shmem_free_inode(inode->i_sb, 0);
4005
4006 simple_offset_remove(shmem_get_offset_ctx(dir), dentry);
4007
4008 dir->i_size -= BOGO_DIRENT_SIZE;
4009 inode_set_mtime_to_ts(dir,
4010 inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
4011 inode_inc_iversion(dir);
4012 drop_nlink(inode);
4013 dput(dentry); /* Undo the count from "create" - does all the work */
4014
4015 /*
4016 * For now, VFS can't deal with case-insensitive negative dentries, so
4017 * we invalidate them
4018 */
4019 if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
4020 d_invalidate(dentry);
4021
4022 return 0;
4023 }
4024
4025 static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
4026 {
4027 if (!simple_empty(dentry))
4028 return -ENOTEMPTY;
4029
4030 drop_nlink(d_inode(dentry));
4031 drop_nlink(dir);
4032 return shmem_unlink(dir, dentry);
4033 }
4034
4035 static int shmem_whiteout(struct mnt_idmap *idmap,
4036 struct inode *old_dir, struct dentry *old_dentry)
4037 {
4038 struct dentry *whiteout;
4039 int error;
4040
4041 whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name);
4042 if (!whiteout)
4043 return -ENOMEM;
4044
4045 error = shmem_mknod(idmap, old_dir, whiteout,
4046 S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
4047 dput(whiteout);
4048 if (error)
4049 return error;
4050
4051 /*
4052 * Cheat and hash the whiteout while the old dentry is still in
4053 * place, instead of playing games with FS_RENAME_DOES_D_MOVE.
4054 *
4055 * d_lookup() will consistently find one of them at this point,
4056 * not sure which one, but that isn't even important.
4057 */
4058 d_rehash(whiteout);
4059 return 0;
4060 }
4061
4062 /*
4063 * The VFS layer already does all the dentry stuff for rename,
4064 * we just have to decrement the usage count for the target if
4065 * it exists so that the VFS layer correctly frees it when it
4066 * gets overwritten.
4067 */
4068 static int shmem_rename2(struct mnt_idmap *idmap,
4069 struct inode *old_dir, struct dentry *old_dentry,
4070 struct inode *new_dir, struct dentry *new_dentry,
4071 unsigned int flags)
4072 {
4073 struct inode *inode = d_inode(old_dentry);
4074 int they_are_dirs = S_ISDIR(inode->i_mode);
4075 int error;
4076
4077 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
4078 return -EINVAL;
4079
4080 if (flags & RENAME_EXCHANGE)
4081 return simple_offset_rename_exchange(old_dir, old_dentry,
4082 new_dir, new_dentry);
4083
4084 if (!simple_empty(new_dentry))
4085 return -ENOTEMPTY;
4086
4087 if (flags & RENAME_WHITEOUT) {
4088 error = shmem_whiteout(idmap, old_dir, old_dentry);
4089 if (error)
4090 return error;
4091 }
4092
4093 error = simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry);
4094 if (error)
4095 return error;
4096
4097 if (d_really_is_positive(new_dentry)) {
4098 (void) shmem_unlink(new_dir, new_dentry);
4099 if (they_are_dirs) {
4100 drop_nlink(d_inode(new_dentry));
4101 drop_nlink(old_dir);
4102 }
4103 } else if (they_are_dirs) {
4104 drop_nlink(old_dir);
4105 inc_nlink(new_dir);
4106 }
4107
4108 old_dir->i_size -= BOGO_DIRENT_SIZE;
4109 new_dir->i_size += BOGO_DIRENT_SIZE;
4110 simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
4111 inode_inc_iversion(old_dir);
4112 inode_inc_iversion(new_dir);
4113 return 0;
4114 }
4115
4116 static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
4117 struct dentry *dentry, const char *symname)
4118 {
4119 int error;
4120 int len;
4121 struct inode *inode;
4122 struct folio *folio;
4123 char *link;
4124
4125 len = strlen(symname) + 1;
4126 if (len > PAGE_SIZE)
4127 return -ENAMETOOLONG;
4128
4129 inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0,
4130 VM_NORESERVE);
4131 if (IS_ERR(inode))
4132 return PTR_ERR(inode);
4133
4134 error = security_inode_init_security(inode, dir, &dentry->d_name,
4135 shmem_initxattrs, NULL);
4136 if (error && error != -EOPNOTSUPP)
4137 goto out_iput;
4138
4139 error = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
4140 if (error)
4141 goto out_iput;
4142
4143 inode->i_size = len-1;
4144 if (len <= SHORT_SYMLINK_LEN) {
4145 link = kmemdup(symname, len, GFP_KERNEL);
4146 if (!link) {
4147 error = -ENOMEM;
4148 goto out_remove_offset;
4149 }
4150 inode->i_op = &shmem_short_symlink_operations;
4151 inode_set_cached_link(inode, link, len - 1);
4152 } else {
4153 inode_nohighmem(inode);
4154 inode->i_mapping->a_ops = &shmem_aops;
4155 error = shmem_get_folio(inode, 0, 0, &folio, SGP_WRITE);
4156 if (error)
4157 goto out_remove_offset;
4158 inode->i_op = &shmem_symlink_inode_operations;
4159 memcpy(folio_address(folio), symname, len);
4160 folio_mark_uptodate(folio);
4161 folio_mark_dirty(folio);
4162 folio_unlock(folio);
4163 folio_put(folio);
4164 }
4165 dir->i_size += BOGO_DIRENT_SIZE;
4166 inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
4167 inode_inc_iversion(dir);
4168 if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
4169 d_add(dentry, inode);
4170 else
4171 d_instantiate(dentry, inode);
4172 dget(dentry);
4173 return 0;
4174
4175 out_remove_offset:
4176 simple_offset_remove(shmem_get_offset_ctx(dir), dentry);
4177 out_iput:
4178 iput(inode);
4179 return error;
4180 }
4181
4182 static void shmem_put_link(void *arg)
4183 {
4184 folio_mark_accessed(arg);
4185 folio_put(arg);
4186 }
4187
4188 static const char *shmem_get_link(struct dentry *dentry, struct inode *inode,
4189 struct delayed_call *done)
4190 {
4191 struct folio *folio = NULL;
4192 int error;
4193
4194 if (!dentry) {
4195 folio = filemap_get_folio(inode->i_mapping, 0);
4196 if (IS_ERR(folio))
4197 return ERR_PTR(-ECHILD);
4198 if (PageHWPoison(folio_page(folio, 0)) ||
4199 !folio_test_uptodate(folio)) {
4200 folio_put(folio);
4201 return ERR_PTR(-ECHILD);
4202 }
4203 } else {
4204 error = shmem_get_folio(inode, 0, 0, &folio, SGP_READ);
4205 if (error)
4206 return ERR_PTR(error);
4207 if (!folio)
4208 return ERR_PTR(-ECHILD);
4209 if (PageHWPoison(folio_page(folio, 0))) {
4210 folio_unlock(folio);
4211 folio_put(folio);
4212 return ERR_PTR(-ECHILD);
4213 }
4214 folio_unlock(folio);
4215 }
4216 set_delayed_call(done, shmem_put_link, folio);
4217 return folio_address(folio);
4218 }
4219
4220 #ifdef CONFIG_TMPFS_XATTR
4221
4222 static int shmem_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
4223 {
4224 struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
4225
4226 fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE);
4227
4228 return 0;
4229 }
4230
4231 static int shmem_fileattr_set(struct mnt_idmap *idmap,
4232 struct dentry *dentry, struct file_kattr *fa)
4233 {
4234 struct inode *inode = d_inode(dentry);
4235 struct shmem_inode_info *info = SHMEM_I(inode);
4236 int ret, flags;
4237
4238 if (fileattr_has_fsx(fa))
4239 return -EOPNOTSUPP;
4240 if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE)
4241 return -EOPNOTSUPP;
4242
4243 flags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) |
4244 (fa->flags & SHMEM_FL_USER_MODIFIABLE);
4245
4246 ret = shmem_set_inode_flags(inode, flags, dentry);
4247
4248 if (ret)
4249 return ret;
4250
4251 info->fsflags = flags;
4252
4253 inode_set_ctime_current(inode);
4254 inode_inc_iversion(inode);
4255 return 0;
4256 }
4257
4258 /*
4259 * Superblocks without xattr inode operations may get some security.* xattr
4260 * support from the LSM "for free". As soon as we have any other xattrs
4261 * like ACLs, we also need to implement the security.* handlers at
4262 * filesystem level, though.
4263 */
4264
4265 /*
4266 * Callback for security_inode_init_security() for acquiring xattrs.
4267 */
4268 static int shmem_initxattrs(struct inode *inode,
4269 const struct xattr *xattr_array, void *fs_info)
4270 {
4271 struct shmem_inode_info *info = SHMEM_I(inode);
4272 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
4273 const struct xattr *xattr;
4274 struct simple_xattr *new_xattr;
4275 size_t ispace = 0;
4276 size_t len;
4277
4278 if (sbinfo->max_inodes) {
4279 for (xattr = xattr_array; xattr->name != NULL; xattr++) {
4280 ispace += simple_xattr_space(xattr->name,
4281 xattr->value_len + XATTR_SECURITY_PREFIX_LEN);
4282 }
4283 if (ispace) {
4284 raw_spin_lock(&sbinfo->stat_lock);
4285 if (sbinfo->free_ispace < ispace)
4286 ispace = 0;
4287 else
4288 sbinfo->free_ispace -= ispace;
4289 raw_spin_unlock(&sbinfo->stat_lock);
4290 if (!ispace)
4291 return -ENOSPC;
4292 }
4293 }
4294
4295 for (xattr = xattr_array; xattr->name != NULL; xattr++) {
4296 new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
4297 if (!new_xattr)
4298 break;
4299
4300 len = strlen(xattr->name) + 1;
4301 new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
4302 GFP_KERNEL_ACCOUNT);
4303 if (!new_xattr->name) {
4304 kvfree(new_xattr);
4305 break;
4306 }
4307
4308 memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
4309 XATTR_SECURITY_PREFIX_LEN);
4310 memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
4311 xattr->name, len);
4312
4313 simple_xattr_add(&info->xattrs, new_xattr);
4314 }
4315
4316 if (xattr->name != NULL) {
4317 if (ispace) {
4318 raw_spin_lock(&sbinfo->stat_lock);
4319 sbinfo->free_ispace += ispace;
4320 raw_spin_unlock(&sbinfo->stat_lock);
4321 }
4322 simple_xattrs_free(&info->xattrs, NULL);
4323 return -ENOMEM;
4324 }
4325
4326 return 0;
4327 }
4328
4329 static int shmem_xattr_handler_get(const struct xattr_handler *handler,
4330 struct dentry *unused, struct inode *inode,
4331 const char *name, void *buffer, size_t size)
4332 {
4333 struct shmem_inode_info *info = SHMEM_I(inode);
4334
4335 name = xattr_full_name(handler, name);
4336 return simple_xattr_get(&info->xattrs, name, buffer, size);
4337 }
4338
4339 static int shmem_xattr_handler_set(const struct xattr_handler *handler,
4340 struct mnt_idmap *idmap,
4341 struct dentry *unused, struct inode *inode,
4342 const char *name, const void *value,
4343 size_t size, int flags)
4344 {
4345 struct shmem_inode_info *info = SHMEM_I(inode);
4346 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
4347 struct simple_xattr *old_xattr;
4348 size_t ispace = 0;
4349
4350 name = xattr_full_name(handler, name);
4351 if (value && sbinfo->max_inodes) {
4352 ispace = simple_xattr_space(name, size);
4353 raw_spin_lock(&sbinfo->stat_lock);
4354 if (sbinfo->free_ispace < ispace)
4355 ispace = 0;
4356 else
4357 sbinfo->free_ispace -= ispace;
4358 raw_spin_unlock(&sbinfo->stat_lock);
4359 if (!ispace)
4360 return -ENOSPC;
4361 }
4362
4363 old_xattr = simple_xattr_set(&info->xattrs, name, value, size, flags);
4364 if (!IS_ERR(old_xattr)) {
4365 ispace = 0;
4366 if (old_xattr && sbinfo->max_inodes)
4367 ispace = simple_xattr_space(old_xattr->name,
4368 old_xattr->size);
4369 simple_xattr_free(old_xattr);
4370 old_xattr = NULL;
4371 inode_set_ctime_current(inode);
4372 inode_inc_iversion(inode);
4373 }
4374 if (ispace) {
4375 raw_spin_lock(&sbinfo->stat_lock);
4376 sbinfo->free_ispace += ispace;
4377 raw_spin_unlock(&sbinfo->stat_lock);
4378 }
4379 return PTR_ERR(old_xattr);
4380 }
4381
4382 static const struct xattr_handler shmem_security_xattr_handler = {
4383 .prefix = XATTR_SECURITY_PREFIX,
4384 .get = shmem_xattr_handler_get,
4385 .set = shmem_xattr_handler_set,
4386 };
4387
4388 static const struct xattr_handler shmem_trusted_xattr_handler = {
4389 .prefix = XATTR_TRUSTED_PREFIX,
4390 .get = shmem_xattr_handler_get,
4391 .set = shmem_xattr_handler_set,
4392 };
4393
4394 static const struct xattr_handler shmem_user_xattr_handler = {
4395 .prefix = XATTR_USER_PREFIX,
4396 .get = shmem_xattr_handler_get,
4397 .set = shmem_xattr_handler_set,
4398 };
4399
4400 static const struct xattr_handler * const shmem_xattr_handlers[] = {
4401 &shmem_security_xattr_handler,
4402 &shmem_trusted_xattr_handler,
4403 &shmem_user_xattr_handler,
4404 NULL
4405 };
4406
4407 static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
4408 {
4409 struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
4410 return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size);
4411 }
4412 #endif /* CONFIG_TMPFS_XATTR */
4413
4414 static const struct inode_operations shmem_short_symlink_operations = {
4415 .getattr = shmem_getattr,
4416 .setattr = shmem_setattr,
4417 .get_link = simple_get_link,
4418 #ifdef CONFIG_TMPFS_XATTR
4419 .listxattr = shmem_listxattr,
4420 #endif
4421 };
4422
4423 static const struct inode_operations shmem_symlink_inode_operations = {
4424 .getattr = shmem_getattr,
4425 .setattr = shmem_setattr,
4426 .get_link = shmem_get_link,
4427 #ifdef CONFIG_TMPFS_XATTR
4428 .listxattr = shmem_listxattr,
4429 #endif
4430 };
4431
4432 static struct dentry *shmem_get_parent(struct dentry *child)
4433 {
4434 return ERR_PTR(-ESTALE);
4435 }
4436
4437 static int shmem_match(struct inode *ino, void *vfh)
4438 {
4439 __u32 *fh = vfh;
4440 __u64 inum = fh[2];
4441 inum = (inum << 32) | fh[1];
4442 return ino->i_ino == inum && fh[0] == ino->i_generation;
4443 }
4444
4445 /* Find any alias of inode, but prefer a hashed alias */
4446 static struct dentry *shmem_find_alias(struct inode *inode)
4447 {
4448 struct dentry *alias = d_find_alias(inode);
4449
4450 return alias ?: d_find_any_alias(inode);
4451 }
4452
4453 static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
4454 struct fid *fid, int fh_len, int fh_type)
4455 {
4456 struct inode *inode;
4457 struct dentry *dentry = NULL;
4458 u64 inum;
4459
4460 if (fh_len < 3)
4461 return NULL;
4462
4463 inum = fid->raw[2];
4464 inum = (inum << 32) | fid->raw[1];
4465
4466 inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
4467 shmem_match, fid->raw);
4468 if (inode) {
4469 dentry = shmem_find_alias(inode);
4470 iput(inode);
4471 }
4472
4473 return dentry;
4474 }
4475
4476 static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
4477 struct inode *parent)
4478 {
4479 if (*len < 3) {
4480 *len = 3;
4481 return FILEID_INVALID;
4482 }
4483
4484 if (inode_unhashed(inode)) {
4485 /* Unfortunately insert_inode_hash is not idempotent,
4486 * so as we hash inodes here rather than at creation
4487 * time, we need a lock to ensure we only try
4488 * to do it once
4489 */
4490 static DEFINE_SPINLOCK(lock);
4491 spin_lock(&lock);
4492 if (inode_unhashed(inode))
4493 __insert_inode_hash(inode,
4494 inode->i_ino + inode->i_generation);
4495 spin_unlock(&lock);
4496 }
4497
4498 fh[0] = inode->i_generation;
4499 fh[1] = inode->i_ino;
4500 fh[2] = ((__u64)inode->i_ino) >> 32;
4501
4502 *len = 3;
4503 return 1;
4504 }
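
/*
 * The resulting file handle is three 32-bit words, decoded again by
 * shmem_fh_to_dentry() and shmem_match() above:
 *
 *	fh[0] = i_generation
 *	fh[1] = low 32 bits of i_ino
 *	fh[2] = high 32 bits of i_ino
 *
 * so inum = ((u64)fh[2] << 32) | fh[1], and the inode is found with
 * ilookup5(sb, inum + fh[0], ...) using the same hash value that
 * __insert_inode_hash() was given: i_ino + i_generation.
 */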
4505
4506 static const struct export_operations shmem_export_ops = {
4507 .get_parent = shmem_get_parent,
4508 .encode_fh = shmem_encode_fh,
4509 .fh_to_dentry = shmem_fh_to_dentry,
4510 };
4511
4512 enum shmem_param {
4513 Opt_gid,
4514 Opt_huge,
4515 Opt_mode,
4516 Opt_mpol,
4517 Opt_nr_blocks,
4518 Opt_nr_inodes,
4519 Opt_size,
4520 Opt_uid,
4521 Opt_inode32,
4522 Opt_inode64,
4523 Opt_noswap,
4524 Opt_quota,
4525 Opt_usrquota,
4526 Opt_grpquota,
4527 Opt_usrquota_block_hardlimit,
4528 Opt_usrquota_inode_hardlimit,
4529 Opt_grpquota_block_hardlimit,
4530 Opt_grpquota_inode_hardlimit,
4531 Opt_casefold_version,
4532 Opt_casefold,
4533 Opt_strict_encoding,
4534 };
4535
4536 static const struct constant_table shmem_param_enums_huge[] = {
4537 {"never", SHMEM_HUGE_NEVER },
4538 {"always", SHMEM_HUGE_ALWAYS },
4539 {"within_size", SHMEM_HUGE_WITHIN_SIZE },
4540 {"advise", SHMEM_HUGE_ADVISE },
4541 {}
4542 };
4543
4544 const struct fs_parameter_spec shmem_fs_parameters[] = {
4545 fsparam_gid ("gid", Opt_gid),
4546 fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge),
4547 fsparam_u32oct("mode", Opt_mode),
4548 fsparam_string("mpol", Opt_mpol),
4549 fsparam_string("nr_blocks", Opt_nr_blocks),
4550 fsparam_string("nr_inodes", Opt_nr_inodes),
4551 fsparam_string("size", Opt_size),
4552 fsparam_uid ("uid", Opt_uid),
4553 fsparam_flag ("inode32", Opt_inode32),
4554 fsparam_flag ("inode64", Opt_inode64),
4555 fsparam_flag ("noswap", Opt_noswap),
4556 #ifdef CONFIG_TMPFS_QUOTA
4557 fsparam_flag ("quota", Opt_quota),
4558 fsparam_flag ("usrquota", Opt_usrquota),
4559 fsparam_flag ("grpquota", Opt_grpquota),
4560 fsparam_string("usrquota_block_hardlimit", Opt_usrquota_block_hardlimit),
4561 fsparam_string("usrquota_inode_hardlimit", Opt_usrquota_inode_hardlimit),
4562 fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit),
4563 fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit),
4564 #endif
4565 fsparam_string("casefold", Opt_casefold_version),
4566 fsparam_flag ("casefold", Opt_casefold),
4567 fsparam_flag ("strict_encoding", Opt_strict_encoding),
4568 {}
4569 };
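
/*
 * Example mount invocations exercising the parameter table above
 * (illustrative mount points):
 *
 *	mount -t tmpfs -o size=1G,nr_inodes=10k,mode=1777 tmpfs /mnt/t
 *	mount -t tmpfs -o size=50%,huge=within_size,noswap tmpfs /mnt/t
 *	mount -t tmpfs -o casefold=utf8-12.1.0,strict_encoding tmpfs /mnt/t
 *
 * "size" and "nr_blocks" accept k/m/g suffixes (and "size" a % of RAM),
 * "nr_inodes" the same suffixes, and "huge" one of
 * never/always/within_size/advise.
 */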
4570
4571 #if IS_ENABLED(CONFIG_UNICODE)
4572 static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param,
4573 bool latest_version)
4574 {
4575 struct shmem_options *ctx = fc->fs_private;
4576 int version = UTF8_LATEST;
4577 struct unicode_map *encoding;
4578 char *version_str = param->string + 5;
4579
4580 if (!latest_version) {
4581 if (strncmp(param->string, "utf8-", 5))
4582 return invalfc(fc, "Only UTF-8 encodings are supported "
4583 "in the format: utf8-<version number>");
4584
4585 version = utf8_parse_version(version_str);
4586 if (version < 0)
4587 return invalfc(fc, "Invalid UTF-8 version: %s", version_str);
4588 }
4589
4590 encoding = utf8_load(version);
4591
4592 if (IS_ERR(encoding)) {
4593 return invalfc(fc, "Failed loading UTF-8 version: utf8-%u.%u.%u\n",
4594 unicode_major(version), unicode_minor(version),
4595 unicode_rev(version));
4596 }
4597
4598 pr_info("tmpfs: Using encoding : utf8-%u.%u.%u\n",
4599 unicode_major(version), unicode_minor(version), unicode_rev(version));
4600
4601 ctx->encoding = encoding;
4602
4603 return 0;
4604 }
4605 #else
4606 static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param,
4607 bool latest_version)
4608 {
4609 return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n");
4610 }
4611 #endif
4612
4613 static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
4614 {
4615 struct shmem_options *ctx = fc->fs_private;
4616 struct fs_parse_result result;
4617 unsigned long long size;
4618 char *rest;
4619 int opt;
4620 kuid_t kuid;
4621 kgid_t kgid;
4622
4623 opt = fs_parse(fc, shmem_fs_parameters, param, &result);
4624 if (opt < 0)
4625 return opt;
4626
4627 switch (opt) {
4628 case Opt_size:
4629 size = memparse(param->string, &rest);
4630 if (*rest == '%') {
4631 size <<= PAGE_SHIFT;
4632 size *= totalram_pages();
4633 do_div(size, 100);
4634 rest++;
4635 }
4636 if (*rest)
4637 goto bad_value;
4638 ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE);
4639 ctx->seen |= SHMEM_SEEN_BLOCKS;
4640 break;
4641 case Opt_nr_blocks:
4642 ctx->blocks = memparse(param->string, &rest);
4643 if (*rest || ctx->blocks > LONG_MAX)
4644 goto bad_value;
4645 ctx->seen |= SHMEM_SEEN_BLOCKS;
4646 break;
4647 case Opt_nr_inodes:
4648 ctx->inodes = memparse(param->string, &rest);
4649 if (*rest || ctx->inodes > ULONG_MAX / BOGO_INODE_SIZE)
4650 goto bad_value;
4651 ctx->seen |= SHMEM_SEEN_INODES;
4652 break;
4653 case Opt_mode:
4654 ctx->mode = result.uint_32 & 07777;
4655 break;
4656 case Opt_uid:
4657 kuid = result.uid;
4658
4659 /*
4660 * The requested uid must be representable in the
4661 * filesystem's idmapping.
4662 */
4663 if (!kuid_has_mapping(fc->user_ns, kuid))
4664 goto bad_value;
4665
4666 ctx->uid = kuid;
4667 break;
4668 case Opt_gid:
4669 kgid = result.gid;
4670
4671 /*
4672 * The requested gid must be representable in the
4673 * filesystem's idmapping.
4674 */
4675 if (!kgid_has_mapping(fc->user_ns, kgid))
4676 goto bad_value;
4677
4678 ctx->gid = kgid;
4679 break;
4680 case Opt_huge:
4681 ctx->huge = result.uint_32;
4682 if (ctx->huge != SHMEM_HUGE_NEVER &&
4683 !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
4684 has_transparent_hugepage()))
4685 goto unsupported_parameter;
4686 ctx->seen |= SHMEM_SEEN_HUGE;
4687 break;
4688 case Opt_mpol:
4689 if (IS_ENABLED(CONFIG_NUMA)) {
4690 mpol_put(ctx->mpol);
4691 ctx->mpol = NULL;
4692 if (mpol_parse_str(param->string, &ctx->mpol))
4693 goto bad_value;
4694 break;
4695 }
4696 goto unsupported_parameter;
4697 case Opt_inode32:
4698 ctx->full_inums = false;
4699 ctx->seen |= SHMEM_SEEN_INUMS;
4700 break;
4701 case Opt_inode64:
4702 if (sizeof(ino_t) < 8) {
4703 return invalfc(fc,
4704 "Cannot use inode64 with <64bit inums in kernel\n");
4705 }
4706 ctx->full_inums = true;
4707 ctx->seen |= SHMEM_SEEN_INUMS;
4708 break;
4709 case Opt_noswap:
4710 if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) {
4711 return invalfc(fc,
4712 "Turning off swap in unprivileged tmpfs mounts unsupported");
4713 }
4714 ctx->noswap = true;
4715 ctx->seen |= SHMEM_SEEN_NOSWAP;
4716 break;
4717 case Opt_quota:
4718 if (fc->user_ns != &init_user_ns)
4719 return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
4720 ctx->seen |= SHMEM_SEEN_QUOTA;
4721 ctx->quota_types |= (QTYPE_MASK_USR | QTYPE_MASK_GRP);
4722 break;
4723 case Opt_usrquota:
4724 if (fc->user_ns != &init_user_ns)
4725 return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
4726 ctx->seen |= SHMEM_SEEN_QUOTA;
4727 ctx->quota_types |= QTYPE_MASK_USR;
4728 break;
4729 case Opt_grpquota:
4730 if (fc->user_ns != &init_user_ns)
4731 return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
4732 ctx->seen |= SHMEM_SEEN_QUOTA;
4733 ctx->quota_types |= QTYPE_MASK_GRP;
4734 break;
4735 case Opt_usrquota_block_hardlimit:
4736 size = memparse(param->string, &rest);
4737 if (*rest || !size)
4738 goto bad_value;
4739 if (size > SHMEM_QUOTA_MAX_SPC_LIMIT)
4740 return invalfc(fc,
4741 "User quota block hardlimit too large.");
4742 ctx->qlimits.usrquota_bhardlimit = size;
4743 break;
4744 case Opt_grpquota_block_hardlimit:
4745 size = memparse(param->string, &rest);
4746 if (*rest || !size)
4747 goto bad_value;
4748 if (size > SHMEM_QUOTA_MAX_SPC_LIMIT)
4749 return invalfc(fc,
4750 "Group quota block hardlimit too large.");
4751 ctx->qlimits.grpquota_bhardlimit = size;
4752 break;
4753 case Opt_usrquota_inode_hardlimit:
4754 size = memparse(param->string, &rest);
4755 if (*rest || !size)
4756 goto bad_value;
4757 if (size > SHMEM_QUOTA_MAX_INO_LIMIT)
4758 return invalfc(fc,
4759 "User quota inode hardlimit too large.");
4760 ctx->qlimits.usrquota_ihardlimit = size;
4761 break;
4762 case Opt_grpquota_inode_hardlimit:
4763 size = memparse(param->string, &rest);
4764 if (*rest || !size)
4765 goto bad_value;
4766 if (size > SHMEM_QUOTA_MAX_INO_LIMIT)
4767 return invalfc(fc,
4768 "Group quota inode hardlimit too large.");
4769 ctx->qlimits.grpquota_ihardlimit = size;
4770 break;
4771 case Opt_casefold_version:
4772 return shmem_parse_opt_casefold(fc, param, false);
4773 case Opt_casefold:
4774 return shmem_parse_opt_casefold(fc, param, true);
4775 case Opt_strict_encoding:
4776 #if IS_ENABLED(CONFIG_UNICODE)
4777 ctx->strict_encoding = true;
4778 break;
4779 #else
4780 return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n");
4781 #endif
4782 }
4783 return 0;
4784
4785 unsupported_parameter:
4786 return invalfc(fc, "Unsupported parameter '%s'", param->key);
4787 bad_value:
4788 return invalfc(fc, "Bad value for '%s'", param->key);
4789 }
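
/*
 * Worked example of the "size=N%" arithmetic above, assuming 4KiB pages
 * and 4GiB of RAM (totalram_pages() == 1048576): memparse() returns 50
 * for "size=50%", then
 *
 *	size = 50 << PAGE_SHIFT;			// 204800
 *	size *= totalram_pages();			// 214748364800
 *	do_div(size, 100);				// 2147483648 (2 GiB)
 *	ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE);	// 524288 pages
 */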
4790
4791 static char *shmem_next_opt(char **s)
4792 {
4793 char *sbegin = *s;
4794 char *p;
4795
4796 if (sbegin == NULL)
4797 return NULL;
4798
4799 /*
4800 * NUL-terminate this option: unfortunately,
4801 * mount options form a comma-separated list,
4802 * but mpol's nodelist may also contain commas.
4803 */
4804 for (;;) {
4805 p = strchr(*s, ',');
4806 if (p == NULL)
4807 break;
4808 *s = p + 1;
4809 if (!isdigit(*(p+1))) {
4810 *p = '\0';
4811 return sbegin;
4812 }
4813 }
4814
4815 *s = NULL;
4816 return sbegin;
4817 }
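
/*
 * Example of the splitting rule above: for the monolithic option string
 *
 *	"mpol=bind:0-3,5,size=1G,noswap"
 *
 * the comma inside "0-3,5" is followed by a digit and so is kept as part
 * of mpol's nodelist, giving fs_parse() the three options
 * "mpol=bind:0-3,5", "size=1G" and "noswap".
 */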
4818
4819 static int shmem_parse_monolithic(struct fs_context *fc, void *data)
4820 {
4821 return vfs_parse_monolithic_sep(fc, data, shmem_next_opt);
4822 }
4823
4824 /*
4825 * Reconfigure a shmem filesystem.
4826 */
4827 static int shmem_reconfigure(struct fs_context *fc)
4828 {
4829 struct shmem_options *ctx = fc->fs_private;
4830 struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
4831 unsigned long used_isp;
4832 struct mempolicy *mpol = NULL;
4833 const char *err;
4834
4835 raw_spin_lock(&sbinfo->stat_lock);
4836 used_isp = sbinfo->max_inodes * BOGO_INODE_SIZE - sbinfo->free_ispace;
4837
4838 if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
4839 if (!sbinfo->max_blocks) {
4840 err = "Cannot retroactively limit size";
4841 goto out;
4842 }
4843 if (percpu_counter_compare(&sbinfo->used_blocks,
4844 ctx->blocks) > 0) {
4845 err = "Too small a size for current use";
4846 goto out;
4847 }
4848 }
4849 if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) {
4850 if (!sbinfo->max_inodes) {
4851 err = "Cannot retroactively limit inodes";
4852 goto out;
4853 }
4854 if (ctx->inodes * BOGO_INODE_SIZE < used_isp) {
4855 err = "Too few inodes for current use";
4856 goto out;
4857 }
4858 }
4859
4860 if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums &&
4861 sbinfo->next_ino > UINT_MAX) {
4862 err = "Current inum too high to switch to 32-bit inums";
4863 goto out;
4864 }
4865 if ((ctx->seen & SHMEM_SEEN_NOSWAP) && ctx->noswap && !sbinfo->noswap) {
4866 err = "Cannot disable swap on remount";
4867 goto out;
4868 }
4869 if (!(ctx->seen & SHMEM_SEEN_NOSWAP) && !ctx->noswap && sbinfo->noswap) {
4870 err = "Cannot enable swap on remount if it was disabled on first mount";
4871 goto out;
4872 }
4873
4874 if (ctx->seen & SHMEM_SEEN_QUOTA &&
4875 !sb_any_quota_loaded(fc->root->d_sb)) {
4876 err = "Cannot enable quota on remount";
4877 goto out;
4878 }
4879
4880 #ifdef CONFIG_TMPFS_QUOTA
4881 #define CHANGED_LIMIT(name) \
4882 (ctx->qlimits.name## hardlimit && \
4883 (ctx->qlimits.name## hardlimit != sbinfo->qlimits.name## hardlimit))
4884
4885 if (CHANGED_LIMIT(usrquota_b) || CHANGED_LIMIT(usrquota_i) ||
4886 CHANGED_LIMIT(grpquota_b) || CHANGED_LIMIT(grpquota_i)) {
4887 err = "Cannot change global quota limit on remount";
4888 goto out;
4889 }
4890 #endif /* CONFIG_TMPFS_QUOTA */
4891
4892 if (ctx->seen & SHMEM_SEEN_HUGE)
4893 sbinfo->huge = ctx->huge;
4894 if (ctx->seen & SHMEM_SEEN_INUMS)
4895 sbinfo->full_inums = ctx->full_inums;
4896 if (ctx->seen & SHMEM_SEEN_BLOCKS)
4897 sbinfo->max_blocks = ctx->blocks;
4898 if (ctx->seen & SHMEM_SEEN_INODES) {
4899 sbinfo->max_inodes = ctx->inodes;
4900 sbinfo->free_ispace = ctx->inodes * BOGO_INODE_SIZE - used_isp;
4901 }
4902
4903 /*
4904 * Preserve previous mempolicy unless mpol remount option was specified.
4905 */
4906 if (ctx->mpol) {
4907 mpol = sbinfo->mpol;
4908 sbinfo->mpol = ctx->mpol; /* transfers initial ref */
4909 ctx->mpol = NULL;
4910 }
4911
4912 if (ctx->noswap)
4913 sbinfo->noswap = true;
4914
4915 raw_spin_unlock(&sbinfo->stat_lock);
4916 mpol_put(mpol);
4917 return 0;
4918 out:
4919 raw_spin_unlock(&sbinfo->stat_lock);
4920 return invalfc(fc, "%s", err);
4921 }
4922
4923 static int shmem_show_options(struct seq_file *seq, struct dentry *root)
4924 {
4925 struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
4926 struct mempolicy *mpol;
4927
4928 if (sbinfo->max_blocks != shmem_default_max_blocks())
4929 seq_printf(seq, ",size=%luk", K(sbinfo->max_blocks));
4930 if (sbinfo->max_inodes != shmem_default_max_inodes())
4931 seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
4932 if (sbinfo->mode != (0777 | S_ISVTX))
4933 seq_printf(seq, ",mode=%03ho", sbinfo->mode);
4934 if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
4935 seq_printf(seq, ",uid=%u",
4936 from_kuid_munged(&init_user_ns, sbinfo->uid));
4937 if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
4938 seq_printf(seq, ",gid=%u",
4939 from_kgid_munged(&init_user_ns, sbinfo->gid));
4940
4941 /*
4942 * Showing inode{64,32} might be useful even if it's the system default,
4943 * since then people don't have to resort to checking both here and
4944 * /proc/config.gz to confirm 64-bit inums were successfully applied
4945 * (which may not even exist if IKCONFIG_PROC isn't enabled).
4946 *
4947 * We hide it when inode64 isn't the default and we are using 32-bit
4948 * inodes, since that probably just means the feature isn't even under
4949 * consideration.
4950 *
4951 * As such:
4952 *
4953 * +-----------------+-----------------+
4954 * | TMPFS_INODE64=y | TMPFS_INODE64=n |
4955 * +------------------+-----------------+-----------------+
4956 * | full_inums=true | show | show |
4957 * | full_inums=false | show | hide |
4958 * +------------------+-----------------+-----------------+
4959 *
4960 */
4961 if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums)
4962 seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32));
4963 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4964 /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
4965 if (sbinfo->huge)
4966 seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
4967 #endif
4968 mpol = shmem_get_sbmpol(sbinfo);
4969 shmem_show_mpol(seq, mpol);
4970 mpol_put(mpol);
4971 if (sbinfo->noswap)
4972 seq_printf(seq, ",noswap");
4973 #ifdef CONFIG_TMPFS_QUOTA
4974 if (sb_has_quota_active(root->d_sb, USRQUOTA))
4975 seq_printf(seq, ",usrquota");
4976 if (sb_has_quota_active(root->d_sb, GRPQUOTA))
4977 seq_printf(seq, ",grpquota");
4978 if (sbinfo->qlimits.usrquota_bhardlimit)
4979 seq_printf(seq, ",usrquota_block_hardlimit=%lld",
4980 sbinfo->qlimits.usrquota_bhardlimit);
4981 if (sbinfo->qlimits.grpquota_bhardlimit)
4982 seq_printf(seq, ",grpquota_block_hardlimit=%lld",
4983 sbinfo->qlimits.grpquota_bhardlimit);
4984 if (sbinfo->qlimits.usrquota_ihardlimit)
4985 seq_printf(seq, ",usrquota_inode_hardlimit=%lld",
4986 sbinfo->qlimits.usrquota_ihardlimit);
4987 if (sbinfo->qlimits.grpquota_ihardlimit)
4988 seq_printf(seq, ",grpquota_inode_hardlimit=%lld",
4989 sbinfo->qlimits.grpquota_ihardlimit);
4990 #endif
4991 return 0;
4992 }
4993
4994 #endif /* CONFIG_TMPFS */
4995
4996 static void shmem_put_super(struct super_block *sb)
4997 {
4998 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
4999
5000 #if IS_ENABLED(CONFIG_UNICODE)
5001 if (sb->s_encoding)
5002 utf8_unload(sb->s_encoding);
5003 #endif
5004
5005 #ifdef CONFIG_TMPFS_QUOTA
5006 shmem_disable_quotas(sb);
5007 #endif
5008 free_percpu(sbinfo->ino_batch);
5009 percpu_counter_destroy(&sbinfo->used_blocks);
5010 mpol_put(sbinfo->mpol);
5011 kfree(sbinfo);
5012 sb->s_fs_info = NULL;
5013 }
5014
5015 #if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_TMPFS)
5016 static const struct dentry_operations shmem_ci_dentry_ops = {
5017 .d_hash = generic_ci_d_hash,
5018 .d_compare = generic_ci_d_compare,
5019 };
5020 #endif
5021
5022 static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
5023 {
5024 struct shmem_options *ctx = fc->fs_private;
5025 struct inode *inode;
5026 struct shmem_sb_info *sbinfo;
5027 int error = -ENOMEM;
5028
5029 /* Round up to L1_CACHE_BYTES to resist false sharing */
5030 sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
5031 L1_CACHE_BYTES), GFP_KERNEL);
5032 if (!sbinfo)
5033 return error;
5034
5035 sb->s_fs_info = sbinfo;
5036
5037 #ifdef CONFIG_TMPFS
5038 /*
5039 * By default we only allow half of the physical RAM per
5040 * tmpfs instance, limiting inodes to one per page of lowmem;
5041 * but the internal instance is left unlimited.
5042 */
5043 if (!(sb->s_flags & SB_KERNMOUNT)) {
5044 if (!(ctx->seen & SHMEM_SEEN_BLOCKS))
5045 ctx->blocks = shmem_default_max_blocks();
5046 if (!(ctx->seen & SHMEM_SEEN_INODES))
5047 ctx->inodes = shmem_default_max_inodes();
5048 if (!(ctx->seen & SHMEM_SEEN_INUMS))
5049 ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
5050 sbinfo->noswap = ctx->noswap;
5051 } else {
5052 sb->s_flags |= SB_NOUSER;
5053 }
5054 sb->s_export_op = &shmem_export_ops;
5055 sb->s_flags |= SB_NOSEC | SB_I_VERSION;
5056
5057 #if IS_ENABLED(CONFIG_UNICODE)
5058 if (!ctx->encoding && ctx->strict_encoding) {
5059 pr_err("tmpfs: strict_encoding option without encoding is forbidden\n");
5060 error = -EINVAL;
5061 goto failed;
5062 }
5063
5064 if (ctx->encoding) {
5065 sb->s_encoding = ctx->encoding;
5066 set_default_d_op(sb, &shmem_ci_dentry_ops);
5067 if (ctx->strict_encoding)
5068 sb->s_encoding_flags = SB_ENC_STRICT_MODE_FL;
5069 }
5070 #endif
5071
5072 #else
5073 sb->s_flags |= SB_NOUSER;
5074 #endif /* CONFIG_TMPFS */
5075 sb->s_d_flags |= DCACHE_DONTCACHE;
5076 sbinfo->max_blocks = ctx->blocks;
5077 sbinfo->max_inodes = ctx->inodes;
5078 sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE;
5079 if (sb->s_flags & SB_KERNMOUNT) {
5080 sbinfo->ino_batch = alloc_percpu(ino_t);
5081 if (!sbinfo->ino_batch)
5082 goto failed;
5083 }
5084 sbinfo->uid = ctx->uid;
5085 sbinfo->gid = ctx->gid;
5086 sbinfo->full_inums = ctx->full_inums;
5087 sbinfo->mode = ctx->mode;
5088 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
5089 if (ctx->seen & SHMEM_SEEN_HUGE)
5090 sbinfo->huge = ctx->huge;
5091 else
5092 sbinfo->huge = tmpfs_huge;
5093 #endif
5094 sbinfo->mpol = ctx->mpol;
5095 ctx->mpol = NULL;
5096
5097 raw_spin_lock_init(&sbinfo->stat_lock);
5098 if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
5099 goto failed;
5100 spin_lock_init(&sbinfo->shrinklist_lock);
5101 INIT_LIST_HEAD(&sbinfo->shrinklist);
5102
5103 sb->s_maxbytes = MAX_LFS_FILESIZE;
5104 sb->s_blocksize = PAGE_SIZE;
5105 sb->s_blocksize_bits = PAGE_SHIFT;
5106 sb->s_magic = TMPFS_MAGIC;
5107 sb->s_op = &shmem_ops;
5108 sb->s_time_gran = 1;
5109 #ifdef CONFIG_TMPFS_XATTR
5110 sb->s_xattr = shmem_xattr_handlers;
5111 #endif
5112 #ifdef CONFIG_TMPFS_POSIX_ACL
5113 sb->s_flags |= SB_POSIXACL;
5114 #endif
5115 uuid_t uuid;
5116 uuid_gen(&uuid);
5117 super_set_uuid(sb, uuid.b, sizeof(uuid));
5118
5119 #ifdef CONFIG_TMPFS_QUOTA
5120 if (ctx->seen & SHMEM_SEEN_QUOTA) {
5121 sb->dq_op = &shmem_quota_operations;
5122 sb->s_qcop = &dquot_quotactl_sysfile_ops;
5123 sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
5124
5125 /* Copy the default limits from ctx into sbinfo */
5126 memcpy(&sbinfo->qlimits, &ctx->qlimits,
5127 sizeof(struct shmem_quota_limits));
5128
5129 if (shmem_enable_quotas(sb, ctx->quota_types))
5130 goto failed;
5131 }
5132 #endif /* CONFIG_TMPFS_QUOTA */
5133
5134 inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL,
5135 S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
5136 if (IS_ERR(inode)) {
5137 error = PTR_ERR(inode);
5138 goto failed;
5139 }
5140 inode->i_uid = sbinfo->uid;
5141 inode->i_gid = sbinfo->gid;
5142 sb->s_root = d_make_root(inode);
5143 if (!sb->s_root)
5144 goto failed;
5145 return 0;
5146
5147 failed:
5148 shmem_put_super(sb);
5149 return error;
5150 }
5151
5152 static int shmem_get_tree(struct fs_context *fc)
5153 {
5154 return get_tree_nodev(fc, shmem_fill_super);
5155 }
5156
5157 static void shmem_free_fc(struct fs_context *fc)
5158 {
5159 struct shmem_options *ctx = fc->fs_private;
5160
5161 if (ctx) {
5162 mpol_put(ctx->mpol);
5163 kfree(ctx);
5164 }
5165 }
5166
5167 static const struct fs_context_operations shmem_fs_context_ops = {
5168 .free = shmem_free_fc,
5169 .get_tree = shmem_get_tree,
5170 #ifdef CONFIG_TMPFS
5171 .parse_monolithic = shmem_parse_monolithic,
5172 .parse_param = shmem_parse_one,
5173 .reconfigure = shmem_reconfigure,
5174 #endif
5175 };
5176
5177 static struct kmem_cache *shmem_inode_cachep __ro_after_init;
5178
5179 static struct inode *shmem_alloc_inode(struct super_block *sb)
5180 {
5181 struct shmem_inode_info *info;
5182 info = alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL);
5183 if (!info)
5184 return NULL;
5185 return &info->vfs_inode;
5186 }
5187
5188 static void shmem_free_in_core_inode(struct inode *inode)
5189 {
5190 if (S_ISLNK(inode->i_mode))
5191 kfree(inode->i_link);
5192 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
5193 }
5194
5195 static void shmem_destroy_inode(struct inode *inode)
5196 {
5197 if (S_ISREG(inode->i_mode))
5198 mpol_free_shared_policy(&SHMEM_I(inode)->policy);
5199 if (S_ISDIR(inode->i_mode))
5200 simple_offset_destroy(shmem_get_offset_ctx(inode));
5201 }
5202
5203 static void shmem_init_inode(void *foo)
5204 {
5205 struct shmem_inode_info *info = foo;
5206 inode_init_once(&info->vfs_inode);
5207 }
5208
5209 static void __init shmem_init_inodecache(void)
5210 {
5211 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
5212 sizeof(struct shmem_inode_info),
5213 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
5214 }
5215
5216 static void __init shmem_destroy_inodecache(void)
5217 {
5218 kmem_cache_destroy(shmem_inode_cachep);
5219 }
5220
5221 /* Keep the page in page cache instead of truncating it */
5222 static int shmem_error_remove_folio(struct address_space *mapping,
5223 struct folio *folio)
5224 {
5225 return 0;
5226 }
5227
5228 static const struct address_space_operations shmem_aops = {
5229 .dirty_folio = noop_dirty_folio,
5230 #ifdef CONFIG_TMPFS
5231 .write_begin = shmem_write_begin,
5232 .write_end = shmem_write_end,
5233 #endif
5234 #ifdef CONFIG_MIGRATION
5235 .migrate_folio = migrate_folio,
5236 #endif
5237 .error_remove_folio = shmem_error_remove_folio,
5238 };
5239
5240 static const struct file_operations shmem_file_operations = {
5241 .mmap = shmem_mmap,
5242 .open = shmem_file_open,
5243 .get_unmapped_area = shmem_get_unmapped_area,
5244 #ifdef CONFIG_TMPFS
5245 .llseek = shmem_file_llseek,
5246 .read_iter = shmem_file_read_iter,
5247 .write_iter = shmem_file_write_iter,
5248 .fsync = noop_fsync,
5249 .splice_read = shmem_file_splice_read,
5250 .splice_write = iter_file_splice_write,
5251 .fallocate = shmem_fallocate,
5252 #endif
5253 };
5254
5255 static const struct inode_operations shmem_inode_operations = {
5256 .getattr = shmem_getattr,
5257 .setattr = shmem_setattr,
5258 #ifdef CONFIG_TMPFS_XATTR
5259 .listxattr = shmem_listxattr,
5260 .set_acl = simple_set_acl,
5261 .fileattr_get = shmem_fileattr_get,
5262 .fileattr_set = shmem_fileattr_set,
5263 #endif
5264 };
5265
5266 static const struct inode_operations shmem_dir_inode_operations = {
5267 #ifdef CONFIG_TMPFS
5268 .getattr = shmem_getattr,
5269 .create = shmem_create,
5270 .lookup = simple_lookup,
5271 .link = shmem_link,
5272 .unlink = shmem_unlink,
5273 .symlink = shmem_symlink,
5274 .mkdir = shmem_mkdir,
5275 .rmdir = shmem_rmdir,
5276 .mknod = shmem_mknod,
5277 .rename = shmem_rename2,
5278 .tmpfile = shmem_tmpfile,
5279 .get_offset_ctx = shmem_get_offset_ctx,
5280 #endif
5281 #ifdef CONFIG_TMPFS_XATTR
5282 .listxattr = shmem_listxattr,
5283 .fileattr_get = shmem_fileattr_get,
5284 .fileattr_set = shmem_fileattr_set,
5285 #endif
5286 #ifdef CONFIG_TMPFS_POSIX_ACL
5287 .setattr = shmem_setattr,
5288 .set_acl = simple_set_acl,
5289 #endif
5290 };
5291
5292 static const struct inode_operations shmem_special_inode_operations = {
5293 .getattr = shmem_getattr,
5294 #ifdef CONFIG_TMPFS_XATTR
5295 .listxattr = shmem_listxattr,
5296 #endif
5297 #ifdef CONFIG_TMPFS_POSIX_ACL
5298 .setattr = shmem_setattr,
5299 .set_acl = simple_set_acl,
5300 #endif
5301 };
5302
5303 static const struct super_operations shmem_ops = {
5304 .alloc_inode = shmem_alloc_inode,
5305 .free_inode = shmem_free_in_core_inode,
5306 .destroy_inode = shmem_destroy_inode,
5307 #ifdef CONFIG_TMPFS
5308 .statfs = shmem_statfs,
5309 .show_options = shmem_show_options,
5310 #endif
5311 #ifdef CONFIG_TMPFS_QUOTA
5312 .get_dquots = shmem_get_dquots,
5313 #endif
5314 .evict_inode = shmem_evict_inode,
5315 .drop_inode = generic_delete_inode,
5316 .put_super = shmem_put_super,
5317 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
5318 .nr_cached_objects = shmem_unused_huge_count,
5319 .free_cached_objects = shmem_unused_huge_scan,
5320 #endif
5321 };
5322
5323 static const struct vm_operations_struct shmem_vm_ops = {
5324 .fault = shmem_fault,
5325 .map_pages = filemap_map_pages,
5326 #ifdef CONFIG_NUMA
5327 .set_policy = shmem_set_policy,
5328 .get_policy = shmem_get_policy,
5329 #endif
5330 };
5331
5332 static const struct vm_operations_struct shmem_anon_vm_ops = {
5333 .fault = shmem_fault,
5334 .map_pages = filemap_map_pages,
5335 #ifdef CONFIG_NUMA
5336 .set_policy = shmem_set_policy,
5337 .get_policy = shmem_get_policy,
5338 #endif
5339 };
5340
shmem_init_fs_context(struct fs_context * fc)5341 int shmem_init_fs_context(struct fs_context *fc)
5342 {
5343 struct shmem_options *ctx;
5344
5345 ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL);
5346 if (!ctx)
5347 return -ENOMEM;
5348
5349 ctx->mode = 0777 | S_ISVTX;
5350 ctx->uid = current_fsuid();
5351 ctx->gid = current_fsgid();
5352
5353 #if IS_ENABLED(CONFIG_UNICODE)
5354 ctx->encoding = NULL;
5355 #endif
5356
5357 fc->fs_private = ctx;
5358 fc->ops = &shmem_fs_context_ops;
5359 return 0;
5360 }
5361
5362 static struct file_system_type shmem_fs_type = {
5363 .owner = THIS_MODULE,
5364 .name = "tmpfs",
5365 .init_fs_context = shmem_init_fs_context,
5366 #ifdef CONFIG_TMPFS
5367 .parameters = shmem_fs_parameters,
5368 #endif
5369 .kill_sb = kill_litter_super,
5370 .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP | FS_MGTIME,
5371 };
5372
5373 #if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS)
5374
5375 #define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \
5376 { \
5377 .attr = { .name = __stringify(_name), .mode = _mode }, \
5378 .show = _show, \
5379 .store = _store, \
5380 }
5381
5382 #define TMPFS_ATTR_W(_name, _store) \
5383 static struct kobj_attribute tmpfs_attr_##_name = \
5384 __INIT_KOBJ_ATTR(_name, 0200, NULL, _store)
5385
5386 #define TMPFS_ATTR_RW(_name, _show, _store) \
5387 static struct kobj_attribute tmpfs_attr_##_name = \
5388 __INIT_KOBJ_ATTR(_name, 0644, _show, _store)
5389
5390 #define TMPFS_ATTR_RO(_name, _show) \
5391 static struct kobj_attribute tmpfs_attr_##_name = \
5392 __INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
5393
5394 #if IS_ENABLED(CONFIG_UNICODE)
casefold_show(struct kobject * kobj,struct kobj_attribute * a,char * buf)5395 static ssize_t casefold_show(struct kobject *kobj, struct kobj_attribute *a,
5396 char *buf)
5397 {
5398 return sysfs_emit(buf, "supported\n");
5399 }
5400 TMPFS_ATTR_RO(casefold, casefold_show);
5401 #endif
5402
5403 static struct attribute *tmpfs_attributes[] = {
5404 #if IS_ENABLED(CONFIG_UNICODE)
5405 &tmpfs_attr_casefold.attr,
5406 #endif
5407 NULL
5408 };
5409
5410 static const struct attribute_group tmpfs_attribute_group = {
5411 .attrs = tmpfs_attributes,
5412 .name = "features"
5413 };
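
/*
 * With CONFIG_UNICODE enabled, the "features" group above is expected to
 * surface as /sys/fs/tmpfs/features/casefold and read back "supported";
 * the path is inferred from tmpfs_kobj being created under fs_kobj in
 * tmpfs_sysfs_init() below (illustrative note, not a guarantee).
 */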

static struct kobject *tmpfs_kobj;

static int __init tmpfs_sysfs_init(void)
{
        int ret;

        tmpfs_kobj = kobject_create_and_add("tmpfs", fs_kobj);
        if (!tmpfs_kobj)
                return -ENOMEM;

        ret = sysfs_create_group(tmpfs_kobj, &tmpfs_attribute_group);
        if (ret)
                kobject_put(tmpfs_kobj);

        return ret;
}
#endif /* CONFIG_SYSFS && CONFIG_TMPFS */

void __init shmem_init(void)
{
        int error;

        shmem_init_inodecache();

#ifdef CONFIG_TMPFS_QUOTA
        register_quota_format(&shmem_quota_format);
#endif

        error = register_filesystem(&shmem_fs_type);
        if (error) {
                pr_err("Could not register tmpfs\n");
                goto out2;
        }

        shm_mnt = kern_mount(&shmem_fs_type);
        if (IS_ERR(shm_mnt)) {
                error = PTR_ERR(shm_mnt);
                pr_err("Could not kern_mount tmpfs\n");
                goto out1;
        }

#if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS)
        error = tmpfs_sysfs_init();
        if (error) {
                pr_err("Could not init tmpfs sysfs\n");
                goto out1;
        }
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
                SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
        else
                shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */

        /*
         * Default to setting PMD-sized THP to inherit the global setting and
         * disable all other multi-size THPs.
         */
        if (!shmem_orders_configured)
                huge_shmem_orders_inherit = BIT(HPAGE_PMD_ORDER);
#endif
        return;

out1:
        unregister_filesystem(&shmem_fs_type);
out2:
#ifdef CONFIG_TMPFS_QUOTA
        unregister_quota_format(&shmem_quota_format);
#endif
        shmem_destroy_inodecache();
        shm_mnt = ERR_PTR(error);
}

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
static ssize_t shmem_enabled_show(struct kobject *kobj,
                                  struct kobj_attribute *attr, char *buf)
{
        static const int values[] = {
                SHMEM_HUGE_ALWAYS,
                SHMEM_HUGE_WITHIN_SIZE,
                SHMEM_HUGE_ADVISE,
                SHMEM_HUGE_NEVER,
                SHMEM_HUGE_DENY,
                SHMEM_HUGE_FORCE,
        };
        int len = 0;
        int i;

        for (i = 0; i < ARRAY_SIZE(values); i++) {
                len += sysfs_emit_at(buf, len,
                                shmem_huge == values[i] ? "%s[%s]" : "%s%s",
                                i ? " " : "", shmem_format_huge(values[i]));
        }
        len += sysfs_emit_at(buf, len, "\n");

        return len;
}

static ssize_t shmem_enabled_store(struct kobject *kobj,
                struct kobj_attribute *attr, const char *buf, size_t count)
{
        char tmp[16];
        int huge, err;

        if (count + 1 > sizeof(tmp))
                return -EINVAL;
        memcpy(tmp, buf, count);
        tmp[count] = '\0';
        if (count && tmp[count - 1] == '\n')
                tmp[count - 1] = '\0';

        huge = shmem_parse_huge(tmp);
        if (huge == -EINVAL)
                return huge;

        shmem_huge = huge;
        if (shmem_huge > SHMEM_HUGE_DENY)
                SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;

        err = start_stop_khugepaged();
        return err ? err : count;
}

struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled);
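
/*
 * shmem_enabled_attr is assumed to be registered by the transparent
 * hugepage sysfs code (not in this file), where it normally appears as
 * /sys/kernel/mm/transparent_hugepage/shmem_enabled, e.g.
 *
 *        echo within_size > /sys/kernel/mm/transparent_hugepage/shmem_enabled
 *
 * accepting the keywords listed in shmem_enabled_show() above.
 */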
static DEFINE_SPINLOCK(huge_shmem_orders_lock);

static ssize_t thpsize_shmem_enabled_show(struct kobject *kobj,
                                          struct kobj_attribute *attr, char *buf)
{
        int order = to_thpsize(kobj)->order;
        const char *output;

        if (test_bit(order, &huge_shmem_orders_always))
                output = "[always] inherit within_size advise never";
        else if (test_bit(order, &huge_shmem_orders_inherit))
                output = "always [inherit] within_size advise never";
        else if (test_bit(order, &huge_shmem_orders_within_size))
                output = "always inherit [within_size] advise never";
        else if (test_bit(order, &huge_shmem_orders_madvise))
                output = "always inherit within_size [advise] never";
        else
                output = "always inherit within_size advise [never]";

        return sysfs_emit(buf, "%s\n", output);
}

static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj,
                                           struct kobj_attribute *attr,
                                           const char *buf, size_t count)
{
        int order = to_thpsize(kobj)->order;
        ssize_t ret = count;

        if (sysfs_streq(buf, "always")) {
                spin_lock(&huge_shmem_orders_lock);
                clear_bit(order, &huge_shmem_orders_inherit);
                clear_bit(order, &huge_shmem_orders_madvise);
                clear_bit(order, &huge_shmem_orders_within_size);
                set_bit(order, &huge_shmem_orders_always);
                spin_unlock(&huge_shmem_orders_lock);
        } else if (sysfs_streq(buf, "inherit")) {
                /* Do not override huge allocation policy with non-PMD sized mTHP */
                if (shmem_huge == SHMEM_HUGE_FORCE &&
                    order != HPAGE_PMD_ORDER)
                        return -EINVAL;

                spin_lock(&huge_shmem_orders_lock);
                clear_bit(order, &huge_shmem_orders_always);
                clear_bit(order, &huge_shmem_orders_madvise);
                clear_bit(order, &huge_shmem_orders_within_size);
                set_bit(order, &huge_shmem_orders_inherit);
                spin_unlock(&huge_shmem_orders_lock);
        } else if (sysfs_streq(buf, "within_size")) {
                spin_lock(&huge_shmem_orders_lock);
                clear_bit(order, &huge_shmem_orders_always);
                clear_bit(order, &huge_shmem_orders_inherit);
                clear_bit(order, &huge_shmem_orders_madvise);
                set_bit(order, &huge_shmem_orders_within_size);
                spin_unlock(&huge_shmem_orders_lock);
        } else if (sysfs_streq(buf, "advise")) {
                spin_lock(&huge_shmem_orders_lock);
                clear_bit(order, &huge_shmem_orders_always);
                clear_bit(order, &huge_shmem_orders_inherit);
                clear_bit(order, &huge_shmem_orders_within_size);
                set_bit(order, &huge_shmem_orders_madvise);
                spin_unlock(&huge_shmem_orders_lock);
        } else if (sysfs_streq(buf, "never")) {
                spin_lock(&huge_shmem_orders_lock);
                clear_bit(order, &huge_shmem_orders_always);
                clear_bit(order, &huge_shmem_orders_inherit);
                clear_bit(order, &huge_shmem_orders_within_size);
                clear_bit(order, &huge_shmem_orders_madvise);
                spin_unlock(&huge_shmem_orders_lock);
        } else {
                ret = -EINVAL;
        }

        if (ret > 0) {
                int err = start_stop_khugepaged();

                if (err)
                        ret = err;
        }
        return ret;
}

struct kobj_attribute thpsize_shmem_enabled_attr =
        __ATTR(shmem_enabled, 0644, thpsize_shmem_enabled_show, thpsize_shmem_enabled_store);
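
/*
 * Per-size counterpart of shmem_enabled_attr above; it is assumed to be
 * attached to each thpsize kobject, so a given order would appear as e.g.
 *
 *        /sys/kernel/mm/transparent_hugepage/hugepages-64kB/shmem_enabled
 *
 * (path illustrative; the hugepages-<size>kB directory comes from the
 * thpsize sysfs code, not from this file).
 */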
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */

#if defined(CONFIG_TRANSPARENT_HUGEPAGE)

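/*
 * Boot-time counterparts of the sysfs knobs above. Both parameters below
 * accept the keywords parsed by shmem_parse_huge(); for example, on the
 * kernel command line (values illustrative):
 *
 *        transparent_hugepage_shmem=within_size
 *        transparent_hugepage_tmpfs=always
 */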
static int __init setup_transparent_hugepage_shmem(char *str)
{
        int huge;

        huge = shmem_parse_huge(str);
        if (huge == -EINVAL) {
                pr_warn("transparent_hugepage_shmem= cannot parse, ignored\n");
                return huge;
        }

        shmem_huge = huge;
        return 1;
}
__setup("transparent_hugepage_shmem=", setup_transparent_hugepage_shmem);

static int __init setup_transparent_hugepage_tmpfs(char *str)
{
        int huge;

        huge = shmem_parse_huge(str);
        if (huge < 0) {
                pr_warn("transparent_hugepage_tmpfs= cannot parse, ignored\n");
                return huge;
        }

        tmpfs_huge = huge;
        return 1;
}
__setup("transparent_hugepage_tmpfs=", setup_transparent_hugepage_tmpfs);

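/*
 * setup_thp_shmem() below parses the thp_shmem= boot parameter: a
 * ';'-separated list of "<sizes>:<policy>" entries, where <sizes> is a
 * ','-separated list of single sizes or "<start>-<end>" ranges and
 * <policy> is one of always/inherit/within_size/advise/never. For example
 * (sizes illustrative; valid orders depend on the architecture):
 *
 *        thp_shmem=64K-256K:within_size;2M:inherit
 */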
static char str_dup[PAGE_SIZE] __initdata;
static int __init setup_thp_shmem(char *str)
{
        char *token, *range, *policy, *subtoken;
        unsigned long always, inherit, madvise, within_size;
        char *start_size, *end_size;
        int start, end, nr;
        char *p;

        if (!str || strlen(str) + 1 > PAGE_SIZE)
                goto err;
        strscpy(str_dup, str);

        always = huge_shmem_orders_always;
        inherit = huge_shmem_orders_inherit;
        madvise = huge_shmem_orders_madvise;
        within_size = huge_shmem_orders_within_size;
        p = str_dup;
        while ((token = strsep(&p, ";")) != NULL) {
                range = strsep(&token, ":");
                policy = token;

                if (!policy)
                        goto err;

                while ((subtoken = strsep(&range, ",")) != NULL) {
                        if (strchr(subtoken, '-')) {
                                start_size = strsep(&subtoken, "-");
                                end_size = subtoken;

                                start = get_order_from_str(start_size,
                                                THP_ORDERS_ALL_FILE_DEFAULT);
                                end = get_order_from_str(end_size,
                                                THP_ORDERS_ALL_FILE_DEFAULT);
                        } else {
                                start_size = end_size = subtoken;
                                start = end = get_order_from_str(subtoken,
                                                THP_ORDERS_ALL_FILE_DEFAULT);
                        }

                        if (start < 0) {
                                pr_err("invalid size %s in thp_shmem boot parameter\n",
                                       start_size);
                                goto err;
                        }

                        if (end < 0) {
                                pr_err("invalid size %s in thp_shmem boot parameter\n",
                                       end_size);
                                goto err;
                        }

                        if (start > end)
                                goto err;

                        nr = end - start + 1;
                        if (!strcmp(policy, "always")) {
                                bitmap_set(&always, start, nr);
                                bitmap_clear(&inherit, start, nr);
                                bitmap_clear(&madvise, start, nr);
                                bitmap_clear(&within_size, start, nr);
                        } else if (!strcmp(policy, "advise")) {
                                bitmap_set(&madvise, start, nr);
                                bitmap_clear(&inherit, start, nr);
                                bitmap_clear(&always, start, nr);
                                bitmap_clear(&within_size, start, nr);
                        } else if (!strcmp(policy, "inherit")) {
                                bitmap_set(&inherit, start, nr);
                                bitmap_clear(&madvise, start, nr);
                                bitmap_clear(&always, start, nr);
                                bitmap_clear(&within_size, start, nr);
                        } else if (!strcmp(policy, "within_size")) {
                                bitmap_set(&within_size, start, nr);
                                bitmap_clear(&inherit, start, nr);
                                bitmap_clear(&madvise, start, nr);
                                bitmap_clear(&always, start, nr);
                        } else if (!strcmp(policy, "never")) {
                                bitmap_clear(&inherit, start, nr);
                                bitmap_clear(&madvise, start, nr);
                                bitmap_clear(&always, start, nr);
                                bitmap_clear(&within_size, start, nr);
                        } else {
                                pr_err("invalid policy %s in thp_shmem boot parameter\n", policy);
                                goto err;
                        }
                }
        }

        huge_shmem_orders_always = always;
        huge_shmem_orders_madvise = madvise;
        huge_shmem_orders_inherit = inherit;
        huge_shmem_orders_within_size = within_size;
        shmem_orders_configured = true;
        return 1;

err:
        pr_warn("thp_shmem=%s: error parsing string, ignoring setting\n", str);
        return 0;
}
__setup("thp_shmem=", setup_thp_shmem);

#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#else /* !CONFIG_SHMEM */

/*
 * tiny-shmem: simple shmemfs and tmpfs using ramfs code
 *
 * This is intended for small systems where the benefits of the full
 * shmem code (swap-backed and resource-limited) are outweighed by
 * its complexity. On systems without swap this code should be
 * effectively equivalent, but much lighter weight.
 */

static struct file_system_type shmem_fs_type = {
        .name = "tmpfs",
        .init_fs_context = ramfs_init_fs_context,
        .parameters = ramfs_fs_parameters,
        .kill_sb = ramfs_kill_sb,
        .fs_flags = FS_USERNS_MOUNT,
};

void __init shmem_init(void)
{
        BUG_ON(register_filesystem(&shmem_fs_type) != 0);

        shm_mnt = kern_mount(&shmem_fs_type);
        BUG_ON(IS_ERR(shm_mnt));
}

int shmem_unuse(unsigned int type)
{
        return 0;
}

int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
{
        return 0;
}

void shmem_unlock_mapping(struct address_space *mapping)
{
}

#ifdef CONFIG_MMU
unsigned long shmem_get_unmapped_area(struct file *file,
                                      unsigned long addr, unsigned long len,
                                      unsigned long pgoff, unsigned long flags)
{
        return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
}
#endif

void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
        truncate_inode_pages_range(inode->i_mapping, lstart, lend);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);

#define shmem_vm_ops                    generic_file_vm_ops
#define shmem_anon_vm_ops               generic_file_vm_ops
#define shmem_file_operations           ramfs_file_operations
#define shmem_acct_size(flags, size)    0
#define shmem_unacct_size(flags, size)  do {} while (0)

static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
                                struct super_block *sb, struct inode *dir,
                                umode_t mode, dev_t dev, unsigned long flags)
{
        struct inode *inode = ramfs_get_inode(sb, dir, mode, dev);
        return inode ? inode : ERR_PTR(-ENOSPC);
}

#endif /* CONFIG_SHMEM */

/* common code */

static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
                        loff_t size, unsigned long flags, unsigned int i_flags)
{
        struct inode *inode;
        struct file *res;

        if (IS_ERR(mnt))
                return ERR_CAST(mnt);

        if (size < 0 || size > MAX_LFS_FILESIZE)
                return ERR_PTR(-EINVAL);

        if (is_idmapped_mnt(mnt))
                return ERR_PTR(-EINVAL);

        if (shmem_acct_size(flags, size))
                return ERR_PTR(-ENOMEM);

        inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
                                S_IFREG | S_IRWXUGO, 0, flags);
        if (IS_ERR(inode)) {
                shmem_unacct_size(flags, size);
                return ERR_CAST(inode);
        }
        inode->i_flags |= i_flags;
        inode->i_size = size;
        clear_nlink(inode);     /* It is unlinked */
        res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
        if (!IS_ERR(res))
                res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
                                &shmem_file_operations);
        if (IS_ERR(res))
                iput(inode);
        return res;
}

/**
 * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
 *      kernel internal. There will be NO LSM permission checks against the
 *      underlying inode. So users of this interface must do LSM checks at a
 *      higher layer. The users are the big_key and shm implementations. LSM
 *      checks are provided at the key or shm level rather than the inode.
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 */
struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
{
        return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
}
EXPORT_SYMBOL_GPL(shmem_kernel_file_setup);
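
/*
 * Illustrative sketch of a caller (assumed, not taken from this file): a
 * kernel-internal user wanting an unlinked, swap-backed tmpfs file might do
 *
 *        struct file *file;
 *
 *        file = shmem_kernel_file_setup("kernel-buffer", size, 0);
 *        if (IS_ERR(file))
 *                return PTR_ERR(file);
 *        ...
 *        fput(file);
 *
 * "kernel-buffer" is only the name shown in /proc/<pid>/maps, and passing
 * VM_NORESERVE instead of 0 would skip up-front size accounting.
 */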

/**
 * shmem_file_setup - get an unlinked file living in tmpfs
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 */
struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
{
        return __shmem_file_setup(shm_mnt, name, size, flags, 0);
}
EXPORT_SYMBOL_GPL(shmem_file_setup);
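
/*
 * Unlike shmem_kernel_file_setup(), files created here do not get S_PRIVATE,
 * so the usual LSM checks apply to the inode. Usage is otherwise the same
 * (sketch with assumed names):
 *
 *        struct file *file = shmem_file_setup("some-buffer", size, VM_NORESERVE);
 *
 *        if (IS_ERR(file))
 *                return PTR_ERR(file);
 */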

/**
 * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
 * @mnt: the tmpfs mount where the file will be created
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 */
struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
                                       loff_t size, unsigned long flags)
{
        return __shmem_file_setup(mnt, name, size, flags, 0);
}
EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);

/**
 * shmem_zero_setup - setup a shared anonymous mapping
 * @vma: the vma to be mmapped is prepared by do_mmap
 */
int shmem_zero_setup(struct vm_area_struct *vma)
{
        struct file *file;
        loff_t size = vma->vm_end - vma->vm_start;

        /*
         * Cloning a new file under mmap_lock leads to a lock ordering conflict
         * between XFS directory reading and selinux: since this file is only
         * accessible to the user through its mapping, use S_PRIVATE flag to
         * bypass file security, in the same way as shmem_kernel_file_setup().
         */
        file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags);
        if (IS_ERR(file))
                return PTR_ERR(file);

        if (vma->vm_file)
                fput(vma->vm_file);
        vma->vm_file = file;
        vma->vm_ops = &shmem_anon_vm_ops;

        return 0;
}
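
/*
 * For context (illustrative of the userspace-visible path): this is reached
 * when a shared anonymous mapping is created, e.g.
 *
 *        mmap(NULL, len, PROT_READ | PROT_WRITE,
 *             MAP_SHARED | MAP_ANONYMOUS, -1, 0);
 *
 * do_mmap() prepares the vma and shmem_zero_setup() backs it with an
 * unlinked tmpfs file, shown under the "dev/zero" name in /proc/<pid>/maps.
 */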

/**
 * shmem_read_folio_gfp - read into page cache, using specified page allocation flags.
 * @mapping: the folio's address_space
 * @index: the folio index
 * @gfp: the page allocator flags to use if allocating
 *
 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
 * with any new page allocations done using the specified allocation flags.
 * But read_cache_page_gfp() uses the ->read_folio() method: which does not
 * suit tmpfs, since it may have pages in swapcache, and needs to find those
 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
 *
 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
 */
struct folio *shmem_read_folio_gfp(struct address_space *mapping,
                                   pgoff_t index, gfp_t gfp)
{
#ifdef CONFIG_SHMEM
        struct inode *inode = mapping->host;
        struct folio *folio;
        int error;

        error = shmem_get_folio_gfp(inode, index, 0, &folio, SGP_CACHE,
                                    gfp, NULL, NULL);
        if (error)
                return ERR_PTR(error);

        folio_unlock(folio);
        return folio;
#else
        /*
         * The tiny !SHMEM case uses ramfs without swap
         */
        return mapping_read_folio_gfp(mapping, index, gfp);
#endif
}
EXPORT_SYMBOL_GPL(shmem_read_folio_gfp);
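
/*
 * Illustrative sketch of the pattern described above (assumed caller, not
 * part of this file): a driver pinning pages of a shmem-backed object might do
 *
 *        gfp_t gfp = mapping_gfp_mask(mapping) | __GFP_NORETRY | __GFP_NOWARN;
 *        struct folio *folio = shmem_read_folio_gfp(mapping, index, gfp);
 *
 *        if (IS_ERR(folio))
 *                return PTR_ERR(folio);
 *        ...
 *        folio_put(folio);
 *
 * The folio is returned unlocked with a reference held, so the caller drops
 * it with folio_put() when done.
 */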

struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
                                         pgoff_t index, gfp_t gfp)
{
        struct folio *folio = shmem_read_folio_gfp(mapping, index, gfp);
        struct page *page;

        if (IS_ERR(folio))
                return &folio->page;

        page = folio_file_page(folio, index);
        if (PageHWPoison(page)) {
                folio_put(folio);
                return ERR_PTR(-EIO);
        }

        return page;
}
EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
