/*
 * Resizable virtual memory filesystem for Linux.
 *
 * Copyright (C) 2000 Linus Torvalds.
 *		 2000 Transmeta Corp.
 *		 2000-2001 Christoph Rohland
 *		 2000-2001 SAP AG
 *		 2002 Red Hat Inc.
 * Copyright (C) 2002-2011 Hugh Dickins.
 * Copyright (C) 2011 Google Inc.
 * Copyright (C) 2002-2005 VERITAS Software Corporation.
 * Copyright (C) 2004 Andi Kleen, SuSE Labs
 *
 * Extended attribute support for tmpfs:
 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
 *
 * tiny-shmem:
 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
 *
 * This file is released under the GPL.
 */

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/vfs.h>
#include <linux/mount.h>
#include <linux/ramfs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/fileattr.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/sched/signal.h>
#include <linux/export.h>
#include <linux/shmem_fs.h>
#include <linux/swap.h>
#include <linux/uio.h>
#include <linux/hugetlb.h>
#include <linux/fs_parser.h>
#include <linux/swapfile.h>
#include <linux/iversion.h>
#include "swap.h"

static struct vfsmount *shm_mnt __ro_after_init;

#ifdef CONFIG_SHMEM
/*
 * This virtual memory filesystem is heavily based on the ramfs. It
 * extends ramfs by the ability to use swap and honor resource limits
 * which makes it a completely usable filesystem.
 */

#include <linux/xattr.h>
#include <linux/exportfs.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/mman.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/percpu_counter.h>
#include <linux/falloc.h>
#include <linux/splice.h>
#include <linux/security.h>
#include <linux/swapops.h>
#include <linux/mempolicy.h>
#include <linux/namei.h>
#include <linux/ctype.h>
#include <linux/migrate.h>
#include <linux/highmem.h>
#include <linux/seq_file.h>
#include <linux/magic.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <uapi/linux/memfd.h>
#include <linux/rmap.h>
#include <linux/uuid.h>
#include <linux/quotaops.h>
#include <linux/rcupdate_wait.h>

#include <linux/uaccess.h>

#include "internal.h"

#define BLOCKS_PER_PAGE  (PAGE_SIZE/512)
#define VM_ACCT(size)    (PAGE_ALIGN(size) >> PAGE_SHIFT)
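/*
 * Illustrative example (assuming PAGE_SIZE == 4096): VM_ACCT(5000) ==
 * PAGE_ALIGN(5000) >> PAGE_SHIFT == 8192 >> 12 == 2, i.e. a 5000-byte
 * object is charged as two pages against the overcommit accounting.
 */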
/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20

/* Pretend that one inode + its dentry occupy this much memory */
#define BOGO_INODE_SIZE 1024

/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
#define SHORT_SYMLINK_LEN 128

/*
 * shmem_fallocate communicates with shmem_fault or shmem_writepage via
 * inode->i_private (with i_rwsem making sure that it has only one user at
 * a time): we would prefer not to enlarge the shmem inode just for that.
 */
struct shmem_falloc {
	wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
	pgoff_t start;		/* start of range currently being fallocated */
	pgoff_t next;		/* the next page offset to be fallocated */
	pgoff_t nr_falloced;	/* how many new pages have been fallocated */
	pgoff_t nr_unswapped;	/* how often writepage refused to swap out */
};

struct shmem_options {
	unsigned long long blocks;
	unsigned long long inodes;
	struct mempolicy *mpol;
	kuid_t uid;
	kgid_t gid;
	umode_t mode;
	bool full_inums;
	int huge;
	int seen;
	bool noswap;
	unsigned short quota_types;
	struct shmem_quota_limits qlimits;
#define SHMEM_SEEN_BLOCKS 1
#define SHMEM_SEEN_INODES 2
#define SHMEM_SEEN_HUGE 4
#define SHMEM_SEEN_INUMS 8
#define SHMEM_SEEN_NOSWAP 16
#define SHMEM_SEEN_QUOTA 32
};

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static unsigned long huge_shmem_orders_always __read_mostly;
static unsigned long huge_shmem_orders_madvise __read_mostly;
static unsigned long huge_shmem_orders_inherit __read_mostly;
static unsigned long huge_shmem_orders_within_size __read_mostly;
#endif

#ifdef CONFIG_TMPFS
static unsigned long shmem_default_max_blocks(void)
{
	return totalram_pages() / 2;
}

static unsigned long shmem_default_max_inodes(void)
{
	unsigned long nr_pages = totalram_pages();

	return min3(nr_pages - totalhigh_pages(), nr_pages / 2,
			ULONG_MAX / BOGO_INODE_SIZE);
}
#endif

static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
			struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
			struct mm_struct *fault_mm, vm_fault_t *fault_type);

static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
{
	return sb->s_fs_info;
}

/*
 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
 * for shared memory and for shared anonymous (/dev/zero) mappings
 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
 * consistent with the pre-accounting of private mappings ...
 */
static inline int shmem_acct_size(unsigned long flags, loff_t size)
{
	return (flags & VM_NORESERVE) ?
		0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
}

static inline void shmem_unacct_size(unsigned long flags, loff_t size)
{
	if (!(flags & VM_NORESERVE))
		vm_unacct_memory(VM_ACCT(size));
}

static inline int shmem_reacct_size(unsigned long flags,
		loff_t oldsize, loff_t newsize)
{
	if (!(flags & VM_NORESERVE)) {
		if (VM_ACCT(newsize) > VM_ACCT(oldsize))
			return security_vm_enough_memory_mm(current->mm,
					VM_ACCT(newsize) - VM_ACCT(oldsize));
		else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
			vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
	}
	return 0;
}
/*
 * ... whereas tmpfs objects are accounted incrementally as
 * pages are allocated, in order to allow large sparse files.
 * shmem_get_folio reports shmem_acct_blocks failure as -ENOSPC not -ENOMEM,
 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
 */
static inline int shmem_acct_blocks(unsigned long flags, long pages)
{
	if (!(flags & VM_NORESERVE))
		return 0;

	return security_vm_enough_memory_mm(current->mm,
			pages * VM_ACCT(PAGE_SIZE));
}

static inline void shmem_unacct_blocks(unsigned long flags, long pages)
{
	if (flags & VM_NORESERVE)
		vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
}

static int shmem_inode_acct_blocks(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	int err = -ENOSPC;

	if (shmem_acct_blocks(info->flags, pages))
		return err;

	might_sleep();	/* when quotas */
	if (sbinfo->max_blocks) {
		if (!percpu_counter_limited_add(&sbinfo->used_blocks,
						sbinfo->max_blocks, pages))
			goto unacct;

		err = dquot_alloc_block_nodirty(inode, pages);
		if (err) {
			percpu_counter_sub(&sbinfo->used_blocks, pages);
			goto unacct;
		}
	} else {
		err = dquot_alloc_block_nodirty(inode, pages);
		if (err)
			goto unacct;
	}

	return 0;

unacct:
	shmem_unacct_blocks(info->flags, pages);
	return err;
}

static void shmem_inode_unacct_blocks(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);

	might_sleep();	/* when quotas */
	dquot_free_block_nodirty(inode, pages);

	if (sbinfo->max_blocks)
		percpu_counter_sub(&sbinfo->used_blocks, pages);
	shmem_unacct_blocks(info->flags, pages);
}
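/*
 * Illustrative note: because of the VM_NORESERVE split above, a large sparse
 * tmpfs file (e.g. ftruncate() to 1G) is not charged up front; blocks are
 * accounted one page at a time, as data is actually written or faulted in,
 * via shmem_inode_acct_blocks().
 */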
static const struct super_operations shmem_ops;
static const struct address_space_operations shmem_aops;
static const struct file_operations shmem_file_operations;
static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;
static const struct vm_operations_struct shmem_anon_vm_ops;
static struct file_system_type shmem_fs_type;

bool shmem_mapping(struct address_space *mapping)
{
	return mapping->a_ops == &shmem_aops;
}
EXPORT_SYMBOL_GPL(shmem_mapping);

bool vma_is_anon_shmem(struct vm_area_struct *vma)
{
	return vma->vm_ops == &shmem_anon_vm_ops;
}

bool vma_is_shmem(struct vm_area_struct *vma)
{
	return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops;
}

static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);

#ifdef CONFIG_TMPFS_QUOTA

static int shmem_enable_quotas(struct super_block *sb,
			       unsigned short quota_types)
{
	int type, err = 0;

	sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY;
	for (type = 0; type < SHMEM_MAXQUOTAS; type++) {
		if (!(quota_types & (1 << type)))
			continue;
		err = dquot_load_quota_sb(sb, type, QFMT_SHMEM,
					  DQUOT_USAGE_ENABLED |
					  DQUOT_LIMITS_ENABLED);
		if (err)
			goto out_err;
	}
	return 0;

out_err:
	pr_warn("tmpfs: failed to enable quota tracking (type=%d, err=%d)\n",
		type, err);
	for (type--; type >= 0; type--)
		dquot_quota_off(sb, type);
	return err;
}

static void shmem_disable_quotas(struct super_block *sb)
{
	int type;

	for (type = 0; type < SHMEM_MAXQUOTAS; type++)
		dquot_quota_off(sb, type);
}

static struct dquot __rcu **shmem_get_dquots(struct inode *inode)
{
	return SHMEM_I(inode)->i_dquot;
}
#endif /* CONFIG_TMPFS_QUOTA */

/*
 * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
 * produces a novel ino for the newly allocated inode.
 *
 * It may also be called when making a hard link to permit the space needed by
 * each dentry. However, in that case, no new inode number is needed since that
 * internally draws from another pool of inode numbers (currently global
 * get_next_ino()). This case is indicated by passing NULL as inop.
 */
#define SHMEM_INO_BATCH 1024
static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	ino_t ino;

	if (!(sb->s_flags & SB_KERNMOUNT)) {
		raw_spin_lock(&sbinfo->stat_lock);
		if (sbinfo->max_inodes) {
			if (sbinfo->free_ispace < BOGO_INODE_SIZE) {
				raw_spin_unlock(&sbinfo->stat_lock);
				return -ENOSPC;
			}
			sbinfo->free_ispace -= BOGO_INODE_SIZE;
		}
		if (inop) {
			ino = sbinfo->next_ino++;
			if (unlikely(is_zero_ino(ino)))
				ino = sbinfo->next_ino++;
			if (unlikely(!sbinfo->full_inums &&
				     ino > UINT_MAX)) {
				/*
				 * Emulate get_next_ino uint wraparound for
				 * compatibility
				 */
				if (IS_ENABLED(CONFIG_64BIT))
					pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
						__func__, MINOR(sb->s_dev));
				sbinfo->next_ino = 1;
				ino = sbinfo->next_ino++;
			}
			*inop = ino;
		}
		raw_spin_unlock(&sbinfo->stat_lock);
	} else if (inop) {
		/*
		 * __shmem_file_setup, one of our callers, is lock-free: it
		 * doesn't hold stat_lock in shmem_reserve_inode since
		 * max_inodes is always 0, and is called from potentially
		 * unknown contexts. As such, use a per-cpu batched allocator
		 * which doesn't require the per-sb stat_lock unless we are at
		 * the batch boundary.
		 *
		 * We don't need to worry about inode{32,64} since SB_KERNMOUNT
		 * shmem mounts are not exposed to userspace, so we don't need
		 * to worry about things like glibc compatibility.
		 */
		ino_t *next_ino;

		next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
		ino = *next_ino;
		if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
			raw_spin_lock(&sbinfo->stat_lock);
			ino = sbinfo->next_ino;
			sbinfo->next_ino += SHMEM_INO_BATCH;
			raw_spin_unlock(&sbinfo->stat_lock);
			if (unlikely(is_zero_ino(ino)))
				ino++;
		}
		*inop = ino;
		*next_ino = ++ino;
		put_cpu();
	}

	return 0;
}

static void shmem_free_inode(struct super_block *sb, size_t freed_ispace)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	if (sbinfo->max_inodes) {
		raw_spin_lock(&sbinfo->stat_lock);
		sbinfo->free_ispace += BOGO_INODE_SIZE + freed_ispace;
		raw_spin_unlock(&sbinfo->stat_lock);
	}
}
/**
 * shmem_recalc_inode - recalculate the block usage of an inode
 * @inode: inode to recalc
 * @alloced: the change in number of pages allocated to inode
 * @swapped: the change in number of pages swapped from inode
 *
 * We have to calculate the free blocks since the mm can drop
 * undirtied hole pages behind our back.
 *
 * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
 */
static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	long freed;

	spin_lock(&info->lock);
	info->alloced += alloced;
	info->swapped += swapped;
	freed = info->alloced - info->swapped -
		READ_ONCE(inode->i_mapping->nrpages);
	/*
	 * Special case: whereas normally shmem_recalc_inode() is called
	 * after i_mapping->nrpages has already been adjusted (up or down),
	 * shmem_writepage() has to raise swapped before nrpages is lowered -
	 * to stop a racing shmem_recalc_inode() from thinking that a page has
	 * been freed.  Compensate here, to avoid the need for a followup call.
	 */
	if (swapped > 0)
		freed += swapped;
	if (freed > 0)
		info->alloced -= freed;
	spin_unlock(&info->lock);

	/* The quota case may block */
	if (freed > 0)
		shmem_inode_unacct_blocks(inode, freed);
}

bool shmem_charge(struct inode *inode, long pages)
{
	struct address_space *mapping = inode->i_mapping;

	if (shmem_inode_acct_blocks(inode, pages))
		return false;

	/* nrpages adjustment first, then shmem_recalc_inode() when balanced */
	xa_lock_irq(&mapping->i_pages);
	mapping->nrpages += pages;
	xa_unlock_irq(&mapping->i_pages);

	shmem_recalc_inode(inode, pages, 0);
	return true;
}

void shmem_uncharge(struct inode *inode, long pages)
{
	/* pages argument is currently unused: keep it to help debugging */
	/* nrpages adjustment done by __filemap_remove_folio() or caller */

	shmem_recalc_inode(inode, 0, 0);
}

/*
 * Replace item expected in xarray by a new item, while holding xa_lock.
 */
static int shmem_replace_entry(struct address_space *mapping,
			pgoff_t index, void *expected, void *replacement)
{
	XA_STATE(xas, &mapping->i_pages, index);
	void *item;

	VM_BUG_ON(!expected);
	VM_BUG_ON(!replacement);
	item = xas_load(&xas);
	if (item != expected)
		return -ENOENT;
	xas_store(&xas, replacement);
	return 0;
}

/*
 * Sometimes, before we decide whether to proceed or to fail, we must check
 * that an entry was not already brought back from swap by a racing thread.
 *
 * Checking page is not enough: by the time a SwapCache page is locked, it
 * might be reused, and again be SwapCache, using the same swap as before.
 */
static bool shmem_confirm_swap(struct address_space *mapping,
			       pgoff_t index, swp_entry_t swap)
{
	return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
}

/*
 * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
 *
 * SHMEM_HUGE_NEVER:
 *	disables huge pages for the mount;
 * SHMEM_HUGE_ALWAYS:
 *	enables huge pages for the mount;
 * SHMEM_HUGE_WITHIN_SIZE:
 *	only allocate huge pages if the page will be fully within i_size,
 *	also respect fadvise()/madvise() hints;
 * SHMEM_HUGE_ADVISE:
 *	only allocate huge pages if requested with fadvise()/madvise();
 */

#define SHMEM_HUGE_NEVER	0
#define SHMEM_HUGE_ALWAYS	1
#define SHMEM_HUGE_WITHIN_SIZE	2
#define SHMEM_HUGE_ADVISE	3

/*
 * Special values.
 * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
 *
 * SHMEM_HUGE_DENY:
 *	disables huge on shm_mnt and all mounts, for emergency use;
 * SHMEM_HUGE_FORCE:
 *	enables huge on shm_mnt and all mounts, w/o needing option, for testing;
 *
 */
#define SHMEM_HUGE_DENY		(-1)
#define SHMEM_HUGE_FORCE	(-2)
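/*
 * Example (illustrative): "mount -t tmpfs -o huge=within_size tmpfs /mnt"
 * selects SHMEM_HUGE_WITHIN_SIZE for that mount, while
 * "echo force > /sys/kernel/mm/transparent_hugepage/shmem_enabled"
 * applies SHMEM_HUGE_FORCE globally, overriding the per-mount setting.
 */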
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* ifdef here to avoid bloating shmem.o when not necessary */

static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;

static bool __shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
					bool shmem_huge_force, struct vm_area_struct *vma,
					unsigned long vm_flags)
{
	struct mm_struct *mm = vma ? vma->vm_mm : NULL;
	loff_t i_size;

	if (!S_ISREG(inode->i_mode))
		return false;
	if (mm && ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &mm->flags)))
		return false;
	if (shmem_huge == SHMEM_HUGE_DENY)
		return false;
	if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE)
		return true;

	switch (SHMEM_SB(inode->i_sb)->huge) {
	case SHMEM_HUGE_ALWAYS:
		return true;
	case SHMEM_HUGE_WITHIN_SIZE:
		index = round_up(index + 1, HPAGE_PMD_NR);
		i_size = round_up(i_size_read(inode), PAGE_SIZE);
		if (i_size >> PAGE_SHIFT >= index)
			return true;
		fallthrough;
	case SHMEM_HUGE_ADVISE:
		if (mm && (vm_flags & VM_HUGEPAGE))
			return true;
		fallthrough;
	default:
		return false;
	}
}

static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
		   bool shmem_huge_force, struct vm_area_struct *vma,
		   unsigned long vm_flags)
{
	if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER)
		return false;

	return __shmem_huge_global_enabled(inode, index, shmem_huge_force,
					   vma, vm_flags);
}

#if defined(CONFIG_SYSFS)
static int shmem_parse_huge(const char *str)
{
	if (!strcmp(str, "never"))
		return SHMEM_HUGE_NEVER;
	if (!strcmp(str, "always"))
		return SHMEM_HUGE_ALWAYS;
	if (!strcmp(str, "within_size"))
		return SHMEM_HUGE_WITHIN_SIZE;
	if (!strcmp(str, "advise"))
		return SHMEM_HUGE_ADVISE;
	if (!strcmp(str, "deny"))
		return SHMEM_HUGE_DENY;
	if (!strcmp(str, "force"))
		return SHMEM_HUGE_FORCE;
	return -EINVAL;
}
#endif

#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
static const char *shmem_format_huge(int huge)
{
	switch (huge) {
	case SHMEM_HUGE_NEVER:
		return "never";
	case SHMEM_HUGE_ALWAYS:
		return "always";
	case SHMEM_HUGE_WITHIN_SIZE:
		return "within_size";
	case SHMEM_HUGE_ADVISE:
		return "advise";
	case SHMEM_HUGE_DENY:
		return "deny";
	case SHMEM_HUGE_FORCE:
		return "force";
	default:
		VM_BUG_ON(1);
		return "bad_val";
	}
}
#endif
static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
		struct shrink_control *sc, unsigned long nr_to_split)
{
	LIST_HEAD(list), *pos, *next;
	LIST_HEAD(to_remove);
	struct inode *inode;
	struct shmem_inode_info *info;
	struct folio *folio;
	unsigned long batch = sc ? sc->nr_to_scan : 128;
	int split = 0;

	if (list_empty(&sbinfo->shrinklist))
		return SHRINK_STOP;

	spin_lock(&sbinfo->shrinklist_lock);
	list_for_each_safe(pos, next, &sbinfo->shrinklist) {
		info = list_entry(pos, struct shmem_inode_info, shrinklist);

		/* pin the inode */
		inode = igrab(&info->vfs_inode);

		/* inode is about to be evicted */
		if (!inode) {
			list_del_init(&info->shrinklist);
			goto next;
		}

		/* Check if there's anything to gain */
		if (round_up(inode->i_size, PAGE_SIZE) ==
				round_up(inode->i_size, HPAGE_PMD_SIZE)) {
			list_move(&info->shrinklist, &to_remove);
			goto next;
		}

		list_move(&info->shrinklist, &list);
next:
		sbinfo->shrinklist_len--;
		if (!--batch)
			break;
	}
	spin_unlock(&sbinfo->shrinklist_lock);

	list_for_each_safe(pos, next, &to_remove) {
		info = list_entry(pos, struct shmem_inode_info, shrinklist);
		inode = &info->vfs_inode;
		list_del_init(&info->shrinklist);
		iput(inode);
	}

	list_for_each_safe(pos, next, &list) {
		int ret;
		pgoff_t index;

		info = list_entry(pos, struct shmem_inode_info, shrinklist);
		inode = &info->vfs_inode;

		if (nr_to_split && split >= nr_to_split)
			goto move_back;

		index = (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT;
		folio = filemap_get_folio(inode->i_mapping, index);
		if (IS_ERR(folio))
			goto drop;

		/* No huge page at the end of the file: nothing to split */
		if (!folio_test_large(folio)) {
			folio_put(folio);
			goto drop;
		}

		/*
		 * Move the inode on the list back to shrinklist if we failed
		 * to lock the page at this time.
		 *
		 * Waiting for the lock may lead to deadlock in the
		 * reclaim path.
		 */
		if (!folio_trylock(folio)) {
			folio_put(folio);
			goto move_back;
		}

		ret = split_folio(folio);
		folio_unlock(folio);
		folio_put(folio);

		/* If split failed move the inode on the list back to shrinklist */
		if (ret)
			goto move_back;

		split++;
drop:
		list_del_init(&info->shrinklist);
		goto put;
move_back:
		/*
		 * Make sure the inode is either on the global list or deleted
		 * from any local list before iput() since it could be deleted
		 * in another thread once we put the inode (then the local list
		 * is corrupted).
		 */
		spin_lock(&sbinfo->shrinklist_lock);
		list_move(&info->shrinklist, &sbinfo->shrinklist);
		sbinfo->shrinklist_len++;
		spin_unlock(&sbinfo->shrinklist_lock);
put:
		iput(inode);
	}

	return split;
}
static long shmem_unused_huge_scan(struct super_block *sb,
		struct shrink_control *sc)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

	if (!READ_ONCE(sbinfo->shrinklist_len))
		return SHRINK_STOP;

	return shmem_unused_huge_shrink(sbinfo, sc, 0);
}

static long shmem_unused_huge_count(struct super_block *sb,
		struct shrink_control *sc)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	return READ_ONCE(sbinfo->shrinklist_len);
}
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */

#define shmem_huge SHMEM_HUGE_DENY

static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
		struct shrink_control *sc, unsigned long nr_to_split)
{
	return 0;
}

static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
		bool shmem_huge_force, struct vm_area_struct *vma,
		unsigned long vm_flags)
{
	return false;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Somewhat like filemap_add_folio, but error if expected item has gone.
 */
static int shmem_add_to_page_cache(struct folio *folio,
				   struct address_space *mapping,
				   pgoff_t index, void *expected, gfp_t gfp)
{
	XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
	long nr = folio_nr_pages(folio);

	VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
	VM_BUG_ON(expected && folio_test_large(folio));

	folio_ref_add(folio, nr);
	folio->mapping = mapping;
	folio->index = index;

	gfp &= GFP_RECLAIM_MASK;
	folio_throttle_swaprate(folio, gfp);

	do {
		xas_lock_irq(&xas);
		if (expected != xas_find_conflict(&xas)) {
			xas_set_err(&xas, -EEXIST);
			goto unlock;
		}
		if (expected && xas_find_conflict(&xas)) {
			xas_set_err(&xas, -EEXIST);
			goto unlock;
		}
		xas_store(&xas, folio);
		if (xas_error(&xas))
			goto unlock;
		if (folio_test_pmd_mappable(folio))
			__lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr);
		__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
		__lruvec_stat_mod_folio(folio, NR_SHMEM, nr);
		mapping->nrpages += nr;
unlock:
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));

	if (xas_error(&xas)) {
		folio->mapping = NULL;
		folio_ref_sub(folio, nr);
		return xas_error(&xas);
	}

	return 0;
}

/*
 * Somewhat like filemap_remove_folio, but substitutes swap for @folio.
 */
static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
{
	struct address_space *mapping = folio->mapping;
	long nr = folio_nr_pages(folio);
	int error;

	xa_lock_irq(&mapping->i_pages);
	error = shmem_replace_entry(mapping, folio->index, folio, radswap);
	folio->mapping = NULL;
	mapping->nrpages -= nr;
	__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
	__lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
	xa_unlock_irq(&mapping->i_pages);
	folio_put(folio);
	BUG_ON(error);
}
/*
 * Remove swap entry from page cache, free the swap and its page cache.
 */
static int shmem_free_swap(struct address_space *mapping,
			   pgoff_t index, void *radswap)
{
	void *old;

	old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
	if (old != radswap)
		return -ENOENT;
	free_swap_and_cache(radix_to_swp_entry(radswap));
	return 0;
}

/*
 * Determine (in bytes) how many of the shmem object's pages mapped by the
 * given offsets are swapped out.
 *
 * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
 * as long as the inode doesn't go away and racy results are not a problem.
 */
unsigned long shmem_partial_swap_usage(struct address_space *mapping,
						pgoff_t start, pgoff_t end)
{
	XA_STATE(xas, &mapping->i_pages, start);
	struct page *page;
	unsigned long swapped = 0;
	unsigned long max = end - 1;

	rcu_read_lock();
	xas_for_each(&xas, page, max) {
		if (xas_retry(&xas, page))
			continue;
		if (xa_is_value(page))
			swapped++;
		if (xas.xa_index == max)
			break;
		if (need_resched()) {
			xas_pause(&xas);
			cond_resched_rcu();
		}
	}
	rcu_read_unlock();

	return swapped << PAGE_SHIFT;
}

/*
 * Determine (in bytes) how many of the shmem object's pages mapped by the
 * given vma is swapped out.
 *
 * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
 * as long as the inode doesn't go away and racy results are not a problem.
 */
unsigned long shmem_swap_usage(struct vm_area_struct *vma)
{
	struct inode *inode = file_inode(vma->vm_file);
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct address_space *mapping = inode->i_mapping;
	unsigned long swapped;

	/* Be careful as we don't hold info->lock */
	swapped = READ_ONCE(info->swapped);

	/*
	 * The easier cases are when the shmem object has nothing in swap, or
	 * the vma maps it whole. Then we can simply use the stats that we
	 * already track.
	 */
	if (!swapped)
		return 0;

	if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
		return swapped << PAGE_SHIFT;

	/* Here comes the more involved part */
	return shmem_partial_swap_usage(mapping, vma->vm_pgoff,
					vma->vm_pgoff + vma_pages(vma));
}

/*
 * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
 */
void shmem_unlock_mapping(struct address_space *mapping)
{
	struct folio_batch fbatch;
	pgoff_t index = 0;

	folio_batch_init(&fbatch);
	/*
	 * Minor point, but we might as well stop if someone else SHM_LOCKs it.
	 */
	while (!mapping_unevictable(mapping) &&
	       filemap_get_folios(mapping, &index, ~0UL, &fbatch)) {
		check_move_unevictable_folios(&fbatch);
		folio_batch_release(&fbatch);
		cond_resched();
	}
}
static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
{
	struct folio *folio;

	/*
	 * At first avoid shmem_get_folio(,,,SGP_READ): that fails
	 * beyond i_size, and reports fallocated folios as holes.
	 */
	folio = filemap_get_entry(inode->i_mapping, index);
	if (!folio)
		return folio;
	if (!xa_is_value(folio)) {
		folio_lock(folio);
		if (folio->mapping == inode->i_mapping)
			return folio;
		/* The folio has been swapped out */
		folio_unlock(folio);
		folio_put(folio);
	}
	/*
	 * But read a folio back from swap if any of it is within i_size
	 * (although in some cases this is just a waste of time).
	 */
	folio = NULL;
	shmem_get_folio(inode, index, &folio, SGP_READ);
	return folio;
}

/*
 * Remove range of pages and swap entries from page cache, and free them.
 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
 */
static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
								 bool unfalloc)
{
	struct address_space *mapping = inode->i_mapping;
	struct shmem_inode_info *info = SHMEM_I(inode);
	pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
	pgoff_t end = (lend + 1) >> PAGE_SHIFT;
	struct folio_batch fbatch;
	pgoff_t indices[PAGEVEC_SIZE];
	struct folio *folio;
	bool same_folio;
	long nr_swaps_freed = 0;
	pgoff_t index;
	int i;

	if (lend == -1)
		end = -1;	/* unsigned, so actually very big */

	if (info->fallocend > start && info->fallocend <= end && !unfalloc)
		info->fallocend = start;

	folio_batch_init(&fbatch);
	index = start;
	while (index < end && find_lock_entries(mapping, &index, end - 1,
			&fbatch, indices)) {
		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			folio = fbatch.folios[i];

			if (xa_is_value(folio)) {
				if (unfalloc)
					continue;
				nr_swaps_freed += !shmem_free_swap(mapping,
							indices[i], folio);
				continue;
			}

			if (!unfalloc || !folio_test_uptodate(folio))
				truncate_inode_folio(mapping, folio);
			folio_unlock(folio);
		}
		folio_batch_remove_exceptionals(&fbatch);
		folio_batch_release(&fbatch);
		cond_resched();
	}
	/*
	 * When undoing a failed fallocate, we want none of the partial folio
	 * zeroing and splitting below, but shall want to truncate the whole
	 * folio when !uptodate indicates that it was added by this fallocate,
	 * even when [lstart, lend] covers only a part of the folio.
	 */
	if (unfalloc)
		goto whole_folios;

	same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
	folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT);
	if (folio) {
		same_folio = lend < folio_pos(folio) + folio_size(folio);
		folio_mark_dirty(folio);
		if (!truncate_inode_partial_folio(folio, lstart, lend)) {
			start = folio_next_index(folio);
			if (same_folio)
				end = folio->index;
		}
		folio_unlock(folio);
		folio_put(folio);
		folio = NULL;
	}

	if (!same_folio)
		folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT);
	if (folio) {
		folio_mark_dirty(folio);
		if (!truncate_inode_partial_folio(folio, lstart, lend))
			end = folio->index;
		folio_unlock(folio);
		folio_put(folio);
	}

whole_folios:

	index = start;
	while (index < end) {
		cond_resched();

		if (!find_get_entries(mapping, &index, end - 1, &fbatch,
				indices)) {
			/* If all gone or hole-punch or unfalloc, we're done */
			if (index == start || end != -1)
				break;
			/* But if truncating, restart to make sure all gone */
			index = start;
			continue;
		}
		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			folio = fbatch.folios[i];

			if (xa_is_value(folio)) {
				if (unfalloc)
					continue;
				if (shmem_free_swap(mapping, indices[i], folio)) {
					/* Swap was replaced by page: retry */
					index = indices[i];
					break;
				}
				nr_swaps_freed++;
				continue;
			}

			folio_lock(folio);

			if (!unfalloc || !folio_test_uptodate(folio)) {
				if (folio_mapping(folio) != mapping) {
					/* Page was replaced by swap: retry */
					folio_unlock(folio);
					index = indices[i];
					break;
				}
				VM_BUG_ON_FOLIO(folio_test_writeback(folio),
						folio);

				if (!folio_test_large(folio)) {
					truncate_inode_folio(mapping, folio);
				} else if (truncate_inode_partial_folio(folio, lstart, lend)) {
					/*
					 * If we split a page, reset the loop so
					 * that we pick up the new sub pages.
					 * Otherwise the THP was entirely
					 * dropped or the target range was
					 * zeroed, so just continue the loop as
					 * is.
					 */
					if (!folio_test_large(folio)) {
						folio_unlock(folio);
						index = start;
						break;
					}
				}
			}
			folio_unlock(folio);
		}
		folio_batch_remove_exceptionals(&fbatch);
		folio_batch_release(&fbatch);
	}

	shmem_recalc_inode(inode, 0, -nr_swaps_freed);
}

void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
	shmem_undo_range(inode, lstart, lend, false);
	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
	inode_inc_iversion(inode);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);

static int shmem_getattr(struct mnt_idmap *idmap,
			 const struct path *path, struct kstat *stat,
			 u32 request_mask, unsigned int query_flags)
{
	struct inode *inode = path->dentry->d_inode;
	struct shmem_inode_info *info = SHMEM_I(inode);

	if (info->alloced - info->swapped != inode->i_mapping->nrpages)
		shmem_recalc_inode(inode, 0, 0);

	if (info->fsflags & FS_APPEND_FL)
		stat->attributes |= STATX_ATTR_APPEND;
	if (info->fsflags & FS_IMMUTABLE_FL)
		stat->attributes |= STATX_ATTR_IMMUTABLE;
	if (info->fsflags & FS_NODUMP_FL)
		stat->attributes |= STATX_ATTR_NODUMP;
	stat->attributes_mask |= (STATX_ATTR_APPEND |
			STATX_ATTR_IMMUTABLE |
			STATX_ATTR_NODUMP);
	generic_fillattr(idmap, request_mask, inode, stat);

	if (shmem_huge_global_enabled(inode, 0, false, NULL, 0))
		stat->blksize = HPAGE_PMD_SIZE;

	if (request_mask & STATX_BTIME) {
		stat->result_mask |= STATX_BTIME;
		stat->btime.tv_sec = info->i_crtime.tv_sec;
		stat->btime.tv_nsec = info->i_crtime.tv_nsec;
	}

	return 0;
}

static int shmem_setattr(struct mnt_idmap *idmap,
			 struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = d_inode(dentry);
	struct shmem_inode_info *info = SHMEM_I(inode);
	int error;
	bool update_mtime = false;
	bool update_ctime = true;

	error = setattr_prepare(idmap, dentry, attr);
	if (error)
		return error;

	if ((info->seals & F_SEAL_EXEC) && (attr->ia_valid & ATTR_MODE)) {
		if ((inode->i_mode ^ attr->ia_mode) & 0111) {
			return -EPERM;
		}
	}

	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
		loff_t oldsize = inode->i_size;
		loff_t newsize = attr->ia_size;

		/* protected by i_rwsem */
		if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
			return -EPERM;

		if (newsize != oldsize) {
			error = shmem_reacct_size(SHMEM_I(inode)->flags,
					oldsize, newsize);
			if (error)
				return error;
			i_size_write(inode, newsize);
			update_mtime = true;
		} else {
			update_ctime = false;
		}
		if (newsize <= oldsize) {
			loff_t holebegin = round_up(newsize, PAGE_SIZE);
			if (oldsize > holebegin)
				unmap_mapping_range(inode->i_mapping,
							holebegin, 0, 1);
			if (info->alloced)
				shmem_truncate_range(inode,
							newsize, (loff_t)-1);
			/* unmap again to remove racily COWed private pages */
			if (oldsize > holebegin)
				unmap_mapping_range(inode->i_mapping,
							holebegin, 0, 1);
		}
	}

	if (is_quota_modification(idmap, inode, attr)) {
		error = dquot_initialize(inode);
		if (error)
			return error;
	}

	/* Transfer quota accounting */
	if (i_uid_needs_update(idmap, attr, inode) ||
	    i_gid_needs_update(idmap, attr, inode)) {
		error = dquot_transfer(idmap, inode, attr);
		if (error)
			return error;
	}

	setattr_copy(idmap, inode, attr);
	if (attr->ia_valid & ATTR_MODE)
		error = posix_acl_chmod(idmap, dentry, inode->i_mode);
	if (!error && update_ctime) {
		inode_set_ctime_current(inode);
		if (update_mtime)
			inode_set_mtime_to_ts(inode, inode_get_ctime(inode));
		inode_inc_iversion(inode);
	}
	return error;
}

static void shmem_evict_inode(struct inode *inode)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	size_t freed = 0;

	if (shmem_mapping(inode->i_mapping)) {
		shmem_unacct_size(info->flags, inode->i_size);
		inode->i_size = 0;
		mapping_set_exiting(inode->i_mapping);
		shmem_truncate_range(inode, 0, (loff_t)-1);
		if (!list_empty(&info->shrinklist)) {
			spin_lock(&sbinfo->shrinklist_lock);
			if (!list_empty(&info->shrinklist)) {
				list_del_init(&info->shrinklist);
				sbinfo->shrinklist_len--;
			}
			spin_unlock(&sbinfo->shrinklist_lock);
		}
		while (!list_empty(&info->swaplist)) {
			/* Wait while shmem_unuse() is scanning this inode... */
			wait_var_event(&info->stop_eviction,
				       !atomic_read(&info->stop_eviction));
			mutex_lock(&shmem_swaplist_mutex);
			/* ...but beware of the race if we peeked too early */
			if (!atomic_read(&info->stop_eviction))
				list_del_init(&info->swaplist);
			mutex_unlock(&shmem_swaplist_mutex);
		}
	}

	simple_xattrs_free(&info->xattrs, sbinfo->max_inodes ? &freed : NULL);
	shmem_free_inode(inode->i_sb, freed);
	WARN_ON(inode->i_blocks);
	clear_inode(inode);
#ifdef CONFIG_TMPFS_QUOTA
	dquot_free_inode(inode);
	dquot_drop(inode);
#endif
}

static int shmem_find_swap_entries(struct address_space *mapping,
				   pgoff_t start, struct folio_batch *fbatch,
				   pgoff_t *indices, unsigned int type)
{
	XA_STATE(xas, &mapping->i_pages, start);
	struct folio *folio;
	swp_entry_t entry;

	rcu_read_lock();
	xas_for_each(&xas, folio, ULONG_MAX) {
		if (xas_retry(&xas, folio))
			continue;

		if (!xa_is_value(folio))
			continue;

		entry = radix_to_swp_entry(folio);
		/*
		 * swapin error entries can be found in the mapping. But they're
		 * deliberately ignored here as we've done everything we can do.
		 */
		if (swp_type(entry) != type)
			continue;

		indices[folio_batch_count(fbatch)] = xas.xa_index;
		if (!folio_batch_add(fbatch, folio))
			break;

		if (need_resched()) {
			xas_pause(&xas);
			cond_resched_rcu();
		}
	}
	rcu_read_unlock();

	return xas.xa_index;
}
/*
 * Move the swapped pages for an inode to page cache. Returns the count
 * of pages swapped in, or the error in case of failure.
 */
static int shmem_unuse_swap_entries(struct inode *inode,
		struct folio_batch *fbatch, pgoff_t *indices)
{
	int i = 0;
	int ret = 0;
	int error = 0;
	struct address_space *mapping = inode->i_mapping;

	for (i = 0; i < folio_batch_count(fbatch); i++) {
		struct folio *folio = fbatch->folios[i];

		if (!xa_is_value(folio))
			continue;
		error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE,
					mapping_gfp_mask(mapping), NULL, NULL);
		if (error == 0) {
			folio_unlock(folio);
			folio_put(folio);
			ret++;
		}
		if (error == -ENOMEM)
			break;
		error = 0;
	}
	return error ? error : ret;
}

/*
 * If swap found in inode, free it and move page from swapcache to filecache.
 */
static int shmem_unuse_inode(struct inode *inode, unsigned int type)
{
	struct address_space *mapping = inode->i_mapping;
	pgoff_t start = 0;
	struct folio_batch fbatch;
	pgoff_t indices[PAGEVEC_SIZE];
	int ret = 0;

	do {
		folio_batch_init(&fbatch);
		shmem_find_swap_entries(mapping, start, &fbatch, indices, type);
		if (folio_batch_count(&fbatch) == 0) {
			ret = 0;
			break;
		}

		ret = shmem_unuse_swap_entries(inode, &fbatch, indices);
		if (ret < 0)
			break;

		start = indices[folio_batch_count(&fbatch) - 1];
	} while (true);

	return ret;
}

/*
 * Read all the shared memory data that resides in the swap
 * device 'type' back into memory, so the swap device can be
 * unused.
 */
int shmem_unuse(unsigned int type)
{
	struct shmem_inode_info *info, *next;
	int error = 0;

	if (list_empty(&shmem_swaplist))
		return 0;

	mutex_lock(&shmem_swaplist_mutex);
	list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
		if (!info->swapped) {
			list_del_init(&info->swaplist);
			continue;
		}
		/*
		 * Drop the swaplist mutex while searching the inode for swap;
		 * but before doing so, make sure shmem_evict_inode() will not
		 * remove placeholder inode from swaplist, nor let it be freed
		 * (igrab() would protect from unlink, but not from unmount).
		 */
		atomic_inc(&info->stop_eviction);
		mutex_unlock(&shmem_swaplist_mutex);

		error = shmem_unuse_inode(&info->vfs_inode, type);
		cond_resched();

		mutex_lock(&shmem_swaplist_mutex);
		next = list_next_entry(info, swaplist);
		if (!info->swapped)
			list_del_init(&info->swaplist);
		if (atomic_dec_and_test(&info->stop_eviction))
			wake_up_var(&info->stop_eviction);
		if (error)
			break;
	}
	mutex_unlock(&shmem_swaplist_mutex);

	return error;
}
/*
 * Move the page from the page cache to the swap cache.
 */
static int shmem_writepage(struct page *page, struct writeback_control *wbc)
{
	struct folio *folio = page_folio(page);
	struct address_space *mapping = folio->mapping;
	struct inode *inode = mapping->host;
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	swp_entry_t swap;
	pgoff_t index;

	/*
	 * Our capabilities prevent regular writeback or sync from ever calling
	 * shmem_writepage; but a stacking filesystem might use ->writepage of
	 * its underlying filesystem, in which case tmpfs should write out to
	 * swap only in response to memory pressure, and not for the writeback
	 * threads or sync.
	 */
	if (WARN_ON_ONCE(!wbc->for_reclaim))
		goto redirty;

	if (WARN_ON_ONCE((info->flags & VM_LOCKED) || sbinfo->noswap))
		goto redirty;

	if (!total_swap_pages)
		goto redirty;

	/*
	 * If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or
	 * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages,
	 * and its shmem_writeback() needs them to be split when swapping.
	 */
	if (folio_test_large(folio)) {
		/* Ensure the subpages are still dirty */
		folio_test_set_dirty(folio);
		if (split_huge_page(page) < 0)
			goto redirty;
		folio = page_folio(page);
		folio_clear_dirty(folio);
	}

	index = folio->index;

	/*
	 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
	 * value into swapfile.c, the only way we can correctly account for a
	 * fallocated folio arriving here is now to initialize it and write it.
	 *
	 * That's okay for a folio already fallocated earlier, but if we have
	 * not yet completed the fallocation, then (a) we want to keep track
	 * of this folio in case we have to undo it, and (b) it may not be a
	 * good idea to continue anyway, once we're pushing into swap.  So
	 * reactivate the folio, and let shmem_fallocate() quit when too many.
	 */
	if (!folio_test_uptodate(folio)) {
		if (inode->i_private) {
			struct shmem_falloc *shmem_falloc;
			spin_lock(&inode->i_lock);
			shmem_falloc = inode->i_private;
			if (shmem_falloc &&
			    !shmem_falloc->waitq &&
			    index >= shmem_falloc->start &&
			    index < shmem_falloc->next)
				shmem_falloc->nr_unswapped++;
			else
				shmem_falloc = NULL;
			spin_unlock(&inode->i_lock);
			if (shmem_falloc)
				goto redirty;
		}
		folio_zero_range(folio, 0, folio_size(folio));
		flush_dcache_folio(folio);
		folio_mark_uptodate(folio);
	}

	swap = folio_alloc_swap(folio);
	if (!swap.val)
		goto redirty;

	/*
	 * Add inode to shmem_unuse()'s list of swapped-out inodes,
	 * if it's not already there.  Do it now before the folio is
	 * moved to swap cache, when its pagelock no longer protects
	 * the inode from eviction.  But don't unlock the mutex until
	 * we've incremented swapped, because shmem_unuse_inode() will
	 * prune a !swapped inode from the swaplist under this mutex.
	 */
	mutex_lock(&shmem_swaplist_mutex);
	if (list_empty(&info->swaplist))
		list_add(&info->swaplist, &shmem_swaplist);

	if (add_to_swap_cache(folio, swap,
			__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
			NULL) == 0) {
		shmem_recalc_inode(inode, 0, 1);
		swap_shmem_alloc(swap);
		shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));

		mutex_unlock(&shmem_swaplist_mutex);
		BUG_ON(folio_mapped(folio));
		return swap_writepage(&folio->page, wbc);
	}

	mutex_unlock(&shmem_swaplist_mutex);
	put_swap_folio(folio, swap);
redirty:
	folio_mark_dirty(folio);
	if (wbc->for_reclaim)
		return AOP_WRITEPAGE_ACTIVATE;	/* Return with folio locked */
	folio_unlock(folio);
	return 0;
}

#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
	char buffer[64];

	if (!mpol || mpol->mode == MPOL_DEFAULT)
		return;		/* show nothing */

	mpol_to_str(buffer, sizeof(buffer), mpol);

	seq_printf(seq, ",mpol=%s", buffer);
}

static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
{
	struct mempolicy *mpol = NULL;
	if (sbinfo->mpol) {
		raw_spin_lock(&sbinfo->stat_lock);	/* prevent replace/use races */
		mpol = sbinfo->mpol;
		mpol_get(mpol);
		raw_spin_unlock(&sbinfo->stat_lock);
	}
	return mpol;
}
#else /* !CONFIG_NUMA || !CONFIG_TMPFS */
static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
}
static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
{
	return NULL;
}
#endif /* CONFIG_NUMA && CONFIG_TMPFS */

static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
			pgoff_t index, unsigned int order, pgoff_t *ilx);

static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp,
			struct shmem_inode_info *info, pgoff_t index)
{
	struct mempolicy *mpol;
	pgoff_t ilx;
	struct folio *folio;

	mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
	folio = swap_cluster_readahead(swap, gfp, mpol, ilx);
	mpol_cond_put(mpol);

	return folio;
}

/*
 * Make sure huge_gfp is always more limited than limit_gfp.
 * Some of the flags set permissions, while others set limitations.
 */
static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
{
	gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
	gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
	gfp_t zoneflags = limit_gfp & GFP_ZONEMASK;
	gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK);

	/* Allow allocations only from the originally specified zones. */
	result |= zoneflags;

	/*
	 * Minimize the result gfp by taking the union with the deny flags,
	 * and the intersection of the allow flags.
	 */
	result |= (limit_gfp & denyflags);
	result |= (huge_gfp & limit_gfp) & allowflags;

	return result;
}
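/*
 * Worked example (illustrative): if huge_gfp contains __GFP_DIRECT_RECLAIM
 * but limit_gfp does not, the reclaim flag is dropped (allow flags must be
 * present in both masks), while a __GFP_NORETRY in limit_gfp is always
 * propagated to the result (deny flags are OR'ed in unconditionally).
 */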
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
unsigned long shmem_allowable_huge_orders(struct inode *inode,
				struct vm_area_struct *vma, pgoff_t index,
				bool shmem_huge_force)
{
	unsigned long mask = READ_ONCE(huge_shmem_orders_always);
	unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size);
	unsigned long vm_flags = vma ? vma->vm_flags : 0;
	bool global_huge;
	loff_t i_size;
	int order;

	if (vma && ((vm_flags & VM_NOHUGEPAGE) ||
	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)))
		return 0;

	/* If the hardware/firmware marked hugepage support disabled. */
	if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED))
		return 0;

	global_huge = shmem_huge_global_enabled(inode, index, shmem_huge_force,
						vma, vm_flags);
	if (!vma || !vma_is_anon_shmem(vma)) {
		/*
		 * For tmpfs, we now only support PMD sized THP if huge page
		 * is enabled, otherwise fallback to order 0.
		 */
		return global_huge ? BIT(HPAGE_PMD_ORDER) : 0;
	}

	/*
	 * Following the 'deny' semantics of the top level, force the huge
	 * option off from all mounts.
	 */
	if (shmem_huge == SHMEM_HUGE_DENY)
		return 0;

	/*
	 * Only allow inherit orders if the top-level value is 'force', which
	 * means non-PMD sized THP can not override 'huge' mount option now.
	 */
	if (shmem_huge == SHMEM_HUGE_FORCE)
		return READ_ONCE(huge_shmem_orders_inherit);

	/* Allow mTHP that will be fully within i_size. */
	order = highest_order(within_size_orders);
	while (within_size_orders) {
		index = round_up(index + 1, order);
		i_size = round_up(i_size_read(inode), PAGE_SIZE);
		if (i_size >> PAGE_SHIFT >= index) {
			mask |= within_size_orders;
			break;
		}

		order = next_order(&within_size_orders, order);
	}

	if (vm_flags & VM_HUGEPAGE)
		mask |= READ_ONCE(huge_shmem_orders_madvise);

	if (global_huge)
		mask |= READ_ONCE(huge_shmem_orders_inherit);

	return THP_ORDERS_ALL_FILE_DEFAULT & mask;
}
static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf,
					   struct address_space *mapping, pgoff_t index,
					   unsigned long orders)
{
	struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
	pgoff_t aligned_index;
	unsigned long pages;
	int order;

	if (vma) {
		orders = thp_vma_suitable_orders(vma, vmf->address, orders);
		if (!orders)
			return 0;
	}

	/* Find the highest order that can add into the page cache */
	order = highest_order(orders);
	while (orders) {
		pages = 1UL << order;
		aligned_index = round_down(index, pages);
		/*
		 * Check for conflict before waiting on a huge allocation.
		 * Conflict might be that a huge page has just been allocated
		 * and added to page cache by a racing thread, or that there
		 * is already at least one small page in the huge extent.
		 * Be careful to retry when appropriate, but not forever!
		 * Elsewhere -EEXIST would be the right code, but not here.
		 */
		if (!xa_find(&mapping->i_pages, &aligned_index,
				aligned_index + pages - 1, XA_PRESENT))
			break;
		order = next_order(&orders, order);
	}

	return orders;
}
#else
static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf,
					   struct address_space *mapping, pgoff_t index,
					   unsigned long orders)
{
	return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static struct folio *shmem_alloc_folio(gfp_t gfp, int order,
		struct shmem_inode_info *info, pgoff_t index)
{
	struct mempolicy *mpol;
	pgoff_t ilx;
	struct folio *folio;

	mpol = shmem_get_pgoff_policy(info, index, order, &ilx);
	folio = folio_alloc_mpol(gfp, order, mpol, ilx, numa_node_id());
	mpol_cond_put(mpol);

	return folio;
}

static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
		gfp_t gfp, struct inode *inode, pgoff_t index,
		struct mm_struct *fault_mm, unsigned long orders)
{
	struct address_space *mapping = inode->i_mapping;
	struct shmem_inode_info *info = SHMEM_I(inode);
	unsigned long suitable_orders = 0;
	struct folio *folio = NULL;
	long pages;
	int error, order;

	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
		orders = 0;

	if (orders > 0) {
		suitable_orders = shmem_suitable_orders(inode, vmf,
							mapping, index, orders);

		order = highest_order(suitable_orders);
		while (suitable_orders) {
			pages = 1UL << order;
			index = round_down(index, pages);
			folio = shmem_alloc_folio(gfp, order, info, index);
			if (folio)
				goto allocated;

			if (pages == HPAGE_PMD_NR)
				count_vm_event(THP_FILE_FALLBACK);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
			count_mthp_stat(order, MTHP_STAT_SHMEM_FALLBACK);
#endif
			order = next_order(&suitable_orders, order);
		}
	} else {
		pages = 1;
		folio = shmem_alloc_folio(gfp, 0, info, index);
	}
	if (!folio)
		return ERR_PTR(-ENOMEM);

allocated:
	__folio_set_locked(folio);
	__folio_set_swapbacked(folio);

	gfp &= GFP_RECLAIM_MASK;
	error = mem_cgroup_charge(folio, fault_mm, gfp);
	if (error) {
		if (xa_find(&mapping->i_pages, &index,
				index + pages - 1, XA_PRESENT)) {
			error = -EEXIST;
		} else if (pages > 1) {
			if (pages == HPAGE_PMD_NR) {
				count_vm_event(THP_FILE_FALLBACK);
				count_vm_event(THP_FILE_FALLBACK_CHARGE);
			}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
			count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK);
			count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK_CHARGE);
#endif
		}
		goto unlock;
	}

	error = shmem_add_to_page_cache(folio, mapping, index, NULL, gfp);
	if (error)
		goto unlock;

	error = shmem_inode_acct_blocks(inode, pages);
	if (error) {
		struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
		long freed;
		/*
		 * Try to reclaim some space by splitting a few
		 * large folios beyond i_size on the filesystem.
		 */
		shmem_unused_huge_shrink(sbinfo, NULL, 2);
		/*
		 * And do a shmem_recalc_inode() to account for freed pages:
		 * except our folio is there in cache, so not quite balanced.
		 */
		spin_lock(&info->lock);
		freed = pages + info->alloced - info->swapped -
			READ_ONCE(mapping->nrpages);
		if (freed > 0)
			info->alloced -= freed;
		spin_unlock(&info->lock);
		if (freed > 0)
			shmem_inode_unacct_blocks(inode, freed);
		error = shmem_inode_acct_blocks(inode, pages);
		if (error) {
			filemap_remove_folio(folio);
			goto unlock;
		}
	}

	shmem_recalc_inode(inode, pages, 0);
	folio_add_lru(folio);
	return folio;

unlock:
	folio_unlock(folio);
	folio_put(folio);
	return ERR_PTR(error);
}

/*
 * When a page is moved from swapcache to shmem filecache (either by the
 * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of
 * shmem_unuse_inode()), it may have been read in earlier from swap, in
 * ignorance of the mapping it belongs to.  If that mapping has special
 * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
 * we may need to copy to a suitable page before moving to filecache.
 *
 * In a future release, this may well be extended to respect cpuset and
 * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
 * but for now it is a simple matter of zone.
 */
static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp)
{
	return folio_zonenum(folio) > gfp_zone(gfp);
}

static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
				struct shmem_inode_info *info, pgoff_t index)
{
	struct folio *old, *new;
	struct address_space *swap_mapping;
	swp_entry_t entry;
	pgoff_t swap_index;
	int error;

	old = *foliop;
	entry = old->swap;
	swap_index = swap_cache_index(entry);
	swap_mapping = swap_address_space(entry);

	/*
	 * We have arrived here because our zones are constrained, so don't
	 * limit chance of success by further cpuset and node constraints.
	 */
	gfp &= ~GFP_CONSTRAINT_MASK;
	VM_BUG_ON_FOLIO(folio_test_large(old), old);
	new = shmem_alloc_folio(gfp, 0, info, index);
	if (!new)
		return -ENOMEM;

	folio_get(new);
	folio_copy(new, old);
	flush_dcache_folio(new);

	__folio_set_locked(new);
	__folio_set_swapbacked(new);
	folio_mark_uptodate(new);
	new->swap = entry;
	folio_set_swapcache(new);

	/*
	 * Our caller will very soon move newpage out of swapcache, but it's
	 * a nice clean interface for us to replace oldpage by newpage there.
	 */
	xa_lock_irq(&swap_mapping->i_pages);
	error = shmem_replace_entry(swap_mapping, swap_index, old, new);
	if (!error) {
		mem_cgroup_replace_folio(old, new);
		__lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1);
		__lruvec_stat_mod_folio(new, NR_SHMEM, 1);
		__lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1);
		__lruvec_stat_mod_folio(old, NR_SHMEM, -1);
	}
	xa_unlock_irq(&swap_mapping->i_pages);

1934 */ 1935 old = new; 1936 } else { 1937 folio_add_lru(new); 1938 *foliop = new; 1939 } 1940 1941 folio_clear_swapcache(old); 1942 old->private = NULL; 1943 1944 folio_unlock(old); 1945 folio_put_refs(old, 2); 1946 return error; 1947 } 1948 1949 static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, 1950 struct folio *folio, swp_entry_t swap) 1951 { 1952 struct address_space *mapping = inode->i_mapping; 1953 swp_entry_t swapin_error; 1954 void *old; 1955 1956 swapin_error = make_poisoned_swp_entry(); 1957 old = xa_cmpxchg_irq(&mapping->i_pages, index, 1958 swp_to_radix_entry(swap), 1959 swp_to_radix_entry(swapin_error), 0); 1960 if (old != swp_to_radix_entry(swap)) 1961 return; 1962 1963 folio_wait_writeback(folio); 1964 delete_from_swap_cache(folio); 1965 /* 1966 * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks 1967 * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks) 1968 * in shmem_evict_inode(). 1969 */ 1970 shmem_recalc_inode(inode, -1, -1); 1971 swap_free(swap); 1972 } 1973 1974 /* 1975 * Swap in the folio pointed to by *foliop. 1976 * Caller has to make sure that *foliop contains a valid swapped folio. 1977 * Returns 0 and the folio in foliop if success. On failure, returns the 1978 * error code and NULL in *foliop. 1979 */ 1980 static int shmem_swapin_folio(struct inode *inode, pgoff_t index, 1981 struct folio **foliop, enum sgp_type sgp, 1982 gfp_t gfp, struct mm_struct *fault_mm, 1983 vm_fault_t *fault_type) 1984 { 1985 struct address_space *mapping = inode->i_mapping; 1986 struct shmem_inode_info *info = SHMEM_I(inode); 1987 struct swap_info_struct *si; 1988 struct folio *folio = NULL; 1989 swp_entry_t swap; 1990 int error; 1991 1992 VM_BUG_ON(!*foliop || !xa_is_value(*foliop)); 1993 swap = radix_to_swp_entry(*foliop); 1994 *foliop = NULL; 1995 1996 if (is_poisoned_swp_entry(swap)) 1997 return -EIO; 1998 1999 si = get_swap_device(swap); 2000 if (!si) { 2001 if (!shmem_confirm_swap(mapping, index, swap)) 2002 return -EEXIST; 2003 else 2004 return -EINVAL; 2005 } 2006 2007 /* Look it up and read it in.. */ 2008 folio = swap_cache_get_folio(swap, NULL, 0); 2009 if (!folio) { 2010 /* Or update major stats only when swapin succeeds?? */ 2011 if (fault_type) { 2012 *fault_type |= VM_FAULT_MAJOR; 2013 count_vm_event(PGMAJFAULT); 2014 count_memcg_event_mm(fault_mm, PGMAJFAULT); 2015 } 2016 /* Here we actually start the io */ 2017 folio = shmem_swapin_cluster(swap, gfp, info, index); 2018 if (!folio) { 2019 error = -ENOMEM; 2020 goto failed; 2021 } 2022 } 2023 2024 /* We have to do this with folio locked to prevent races */ 2025 folio_lock(folio); 2026 if (!folio_test_swapcache(folio) || 2027 folio->swap.val != swap.val || 2028 !shmem_confirm_swap(mapping, index, swap)) { 2029 error = -EEXIST; 2030 goto unlock; 2031 } 2032 if (!folio_test_uptodate(folio)) { 2033 error = -EIO; 2034 goto failed; 2035 } 2036 folio_wait_writeback(folio); 2037 2038 /* 2039 * Some architectures may have to restore extra metadata to the 2040 * folio after reading from swap. 
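 * (arm64, for example, uses this hook to restore the MTE tags that were
 * saved when the page was swapped out.)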
2041 */ 2042 arch_swap_restore(folio_swap(swap, folio), folio); 2043 2044 if (shmem_should_replace_folio(folio, gfp)) { 2045 error = shmem_replace_folio(&folio, gfp, info, index); 2046 if (error) 2047 goto failed; 2048 } 2049 2050 error = shmem_add_to_page_cache(folio, mapping, index, 2051 swp_to_radix_entry(swap), gfp); 2052 if (error) 2053 goto failed; 2054 2055 shmem_recalc_inode(inode, 0, -1); 2056 2057 if (sgp == SGP_WRITE) 2058 folio_mark_accessed(folio); 2059 2060 delete_from_swap_cache(folio); 2061 folio_mark_dirty(folio); 2062 swap_free(swap); 2063 put_swap_device(si); 2064 2065 *foliop = folio; 2066 return 0; 2067 failed: 2068 if (!shmem_confirm_swap(mapping, index, swap)) 2069 error = -EEXIST; 2070 if (error == -EIO) 2071 shmem_set_folio_swapin_error(inode, index, folio, swap); 2072 unlock: 2073 if (folio) { 2074 folio_unlock(folio); 2075 folio_put(folio); 2076 } 2077 put_swap_device(si); 2078 2079 return error; 2080 } 2081 2082 /* 2083 * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate 2084 * 2085 * If we allocate a new one we do not mark it dirty. That's up to the 2086 * vm. If we swap it in we mark it dirty since we also free the swap 2087 * entry since a page cannot live in both the swap and page cache. 2088 * 2089 * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL. 2090 */ 2091 static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, 2092 struct folio **foliop, enum sgp_type sgp, gfp_t gfp, 2093 struct vm_fault *vmf, vm_fault_t *fault_type) 2094 { 2095 struct vm_area_struct *vma = vmf ? vmf->vma : NULL; 2096 struct mm_struct *fault_mm; 2097 struct folio *folio; 2098 int error; 2099 bool alloced; 2100 unsigned long orders = 0; 2101 2102 if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping))) 2103 return -EINVAL; 2104 2105 if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) 2106 return -EFBIG; 2107 repeat: 2108 if (sgp <= SGP_CACHE && 2109 ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) 2110 return -EINVAL; 2111 2112 alloced = false; 2113 fault_mm = vma ? vma->vm_mm : NULL; 2114 2115 folio = filemap_get_entry(inode->i_mapping, index); 2116 if (folio && vma && userfaultfd_minor(vma)) { 2117 if (!xa_is_value(folio)) 2118 folio_put(folio); 2119 *fault_type = handle_userfault(vmf, VM_UFFD_MINOR); 2120 return 0; 2121 } 2122 2123 if (xa_is_value(folio)) { 2124 error = shmem_swapin_folio(inode, index, &folio, 2125 sgp, gfp, fault_mm, fault_type); 2126 if (error == -EEXIST) 2127 goto repeat; 2128 2129 *foliop = folio; 2130 return error; 2131 } 2132 2133 if (folio) { 2134 folio_lock(folio); 2135 2136 /* Has the folio been truncated or swapped out? */ 2137 if (unlikely(folio->mapping != inode->i_mapping)) { 2138 folio_unlock(folio); 2139 folio_put(folio); 2140 goto repeat; 2141 } 2142 if (sgp == SGP_WRITE) 2143 folio_mark_accessed(folio); 2144 if (folio_test_uptodate(folio)) 2145 goto out; 2146 /* fallocated folio */ 2147 if (sgp != SGP_READ) 2148 goto clear; 2149 folio_unlock(folio); 2150 folio_put(folio); 2151 } 2152 2153 /* 2154 * SGP_READ: succeed on hole, with NULL folio, letting caller zero. 2155 * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail. 2156 */ 2157 *foliop = NULL; 2158 if (sgp == SGP_READ) 2159 return 0; 2160 if (sgp == SGP_NOALLOC) 2161 return -ENOENT; 2162 2163 /* 2164 * Fast cache lookup and swap lookup did not find it: allocate. 
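 * Unless userfaultfd is registered to handle the missing page itself,
 * the largest allowable huge order is tried first, falling back towards
 * order-0 as allocations fail.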
2165 */ 2166 2167 if (vma && userfaultfd_missing(vma)) { 2168 *fault_type = handle_userfault(vmf, VM_UFFD_MISSING); 2169 return 0; 2170 } 2171 2172 /* Find hugepage orders that are allowed for anonymous shmem and tmpfs. */ 2173 orders = shmem_allowable_huge_orders(inode, vma, index, false); 2174 if (orders > 0) { 2175 gfp_t huge_gfp; 2176 2177 huge_gfp = vma_thp_gfp_mask(vma); 2178 huge_gfp = limit_gfp_mask(huge_gfp, gfp); 2179 folio = shmem_alloc_and_add_folio(vmf, huge_gfp, 2180 inode, index, fault_mm, orders); 2181 if (!IS_ERR(folio)) { 2182 if (folio_test_pmd_mappable(folio)) 2183 count_vm_event(THP_FILE_ALLOC); 2184 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2185 count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_ALLOC); 2186 #endif 2187 goto alloced; 2188 } 2189 if (PTR_ERR(folio) == -EEXIST) 2190 goto repeat; 2191 } 2192 2193 folio = shmem_alloc_and_add_folio(vmf, gfp, inode, index, fault_mm, 0); 2194 if (IS_ERR(folio)) { 2195 error = PTR_ERR(folio); 2196 if (error == -EEXIST) 2197 goto repeat; 2198 folio = NULL; 2199 goto unlock; 2200 } 2201 2202 alloced: 2203 alloced = true; 2204 if (folio_test_large(folio) && 2205 DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < 2206 folio_next_index(folio) - 1) { 2207 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 2208 struct shmem_inode_info *info = SHMEM_I(inode); 2209 /* 2210 * Part of the large folio is beyond i_size: subject 2211 * to shrink under memory pressure. 2212 */ 2213 spin_lock(&sbinfo->shrinklist_lock); 2214 /* 2215 * _careful to defend against unlocked access to 2216 * ->shrink_list in shmem_unused_huge_shrink() 2217 */ 2218 if (list_empty_careful(&info->shrinklist)) { 2219 list_add_tail(&info->shrinklist, 2220 &sbinfo->shrinklist); 2221 sbinfo->shrinklist_len++; 2222 } 2223 spin_unlock(&sbinfo->shrinklist_lock); 2224 } 2225 2226 if (sgp == SGP_WRITE) 2227 folio_set_referenced(folio); 2228 /* 2229 * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio. 2230 */ 2231 if (sgp == SGP_FALLOC) 2232 sgp = SGP_WRITE; 2233 clear: 2234 /* 2235 * Let SGP_WRITE caller clear ends if write does not fill folio; 2236 * but SGP_FALLOC on a folio fallocated earlier must initialize 2237 * it now, lest undo on failure cancel our earlier guarantee. 2238 */ 2239 if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) { 2240 long i, n = folio_nr_pages(folio); 2241 2242 for (i = 0; i < n; i++) 2243 clear_highpage(folio_page(folio, i)); 2244 flush_dcache_folio(folio); 2245 folio_mark_uptodate(folio); 2246 } 2247 2248 /* Perhaps the file has been truncated since we checked */ 2249 if (sgp <= SGP_CACHE && 2250 ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { 2251 error = -EINVAL; 2252 goto unlock; 2253 } 2254 out: 2255 *foliop = folio; 2256 return 0; 2257 2258 /* 2259 * Error recovery. 2260 */ 2261 unlock: 2262 if (alloced) 2263 filemap_remove_folio(folio); 2264 shmem_recalc_inode(inode, 0, 0); 2265 if (folio) { 2266 folio_unlock(folio); 2267 folio_put(folio); 2268 } 2269 return error; 2270 } 2271 2272 /** 2273 * shmem_get_folio - find, and lock a shmem folio. 2274 * @inode: inode to search 2275 * @index: the page index. 2276 * @foliop: pointer to the folio if found 2277 * @sgp: SGP_* flags to control behavior 2278 * 2279 * Looks up the page cache entry at @inode & @index. If a folio is 2280 * present, it is returned locked with an increased refcount. 2281 * 2282 * If the caller modifies data in the folio, it must call folio_mark_dirty() 2283 * before unlocking the folio to ensure that the folio is not reclaimed. 
2284 * There is no need to reserve space before calling folio_mark_dirty(). 2285 * 2286 * When no folio is found, the behavior depends on @sgp: 2287 * - for SGP_READ, *@foliop is %NULL and 0 is returned 2288 * - for SGP_NOALLOC, *@foliop is %NULL and -ENOENT is returned 2289 * - for all other flags a new folio is allocated, inserted into the 2290 * page cache and returned locked in @foliop. 2291 * 2292 * Context: May sleep. 2293 * Return: 0 if successful, else a negative error code. 2294 */ 2295 int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, 2296 enum sgp_type sgp) 2297 { 2298 return shmem_get_folio_gfp(inode, index, foliop, sgp, 2299 mapping_gfp_mask(inode->i_mapping), NULL, NULL); 2300 } 2301 EXPORT_SYMBOL_GPL(shmem_get_folio); 2302 2303 /* 2304 * This is like autoremove_wake_function, but it removes the wait queue 2305 * entry unconditionally - even if something else had already woken the 2306 * target. 2307 */ 2308 static int synchronous_wake_function(wait_queue_entry_t *wait, 2309 unsigned int mode, int sync, void *key) 2310 { 2311 int ret = default_wake_function(wait, mode, sync, key); 2312 list_del_init(&wait->entry); 2313 return ret; 2314 } 2315 2316 /* 2317 * Trinity finds that probing a hole which tmpfs is punching can 2318 * prevent the hole-punch from ever completing: which in turn 2319 * locks writers out with its hold on i_rwsem. So refrain from 2320 * faulting pages into the hole while it's being punched. Although 2321 * shmem_undo_range() does remove the additions, it may be unable to 2322 * keep up, as each new page needs its own unmap_mapping_range() call, 2323 * and the i_mmap tree grows ever slower to scan if new vmas are added. 2324 * 2325 * It does not matter if we sometimes reach this check just before the 2326 * hole-punch begins, so that one fault then races with the punch: 2327 * we just need to make racing faults a rare case. 2328 * 2329 * The implementation below would be much simpler if we just used a 2330 * standard mutex or completion: but we cannot take i_rwsem in fault, 2331 * and bloating every shmem inode for this unlikely case would be sad. 2332 */ 2333 static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode) 2334 { 2335 struct shmem_falloc *shmem_falloc; 2336 struct file *fpin = NULL; 2337 vm_fault_t ret = 0; 2338 2339 spin_lock(&inode->i_lock); 2340 shmem_falloc = inode->i_private; 2341 if (shmem_falloc && 2342 shmem_falloc->waitq && 2343 vmf->pgoff >= shmem_falloc->start && 2344 vmf->pgoff < shmem_falloc->next) { 2345 wait_queue_head_t *shmem_falloc_waitq; 2346 DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function); 2347 2348 ret = VM_FAULT_NOPAGE; 2349 fpin = maybe_unlock_mmap_for_io(vmf, NULL); 2350 shmem_falloc_waitq = shmem_falloc->waitq; 2351 prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, 2352 TASK_UNINTERRUPTIBLE); 2353 spin_unlock(&inode->i_lock); 2354 schedule(); 2355 2356 /* 2357 * shmem_falloc_waitq points into the shmem_fallocate() 2358 * stack of the hole-punching task: shmem_falloc_waitq 2359 * is usually invalid by the time we reach here, but 2360 * finish_wait() does not dereference it in that case; 2361 * though i_lock needed lest racing with wake_up_all(). 
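 * The wake_up_all() that releases us is issued by shmem_fallocate()
 * once the punch has completed and inode->i_private has been cleared.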
2362 */ 2363 spin_lock(&inode->i_lock); 2364 finish_wait(shmem_falloc_waitq, &shmem_fault_wait); 2365 } 2366 spin_unlock(&inode->i_lock); 2367 if (fpin) { 2368 fput(fpin); 2369 ret = VM_FAULT_RETRY; 2370 } 2371 return ret; 2372 } 2373 2374 static vm_fault_t shmem_fault(struct vm_fault *vmf) 2375 { 2376 struct inode *inode = file_inode(vmf->vma->vm_file); 2377 gfp_t gfp = mapping_gfp_mask(inode->i_mapping); 2378 struct folio *folio = NULL; 2379 vm_fault_t ret = 0; 2380 int err; 2381 2382 /* 2383 * Trinity finds that probing a hole which tmpfs is punching can 2384 * prevent the hole-punch from ever completing: noted in i_private. 2385 */ 2386 if (unlikely(inode->i_private)) { 2387 ret = shmem_falloc_wait(vmf, inode); 2388 if (ret) 2389 return ret; 2390 } 2391 2392 WARN_ON_ONCE(vmf->page != NULL); 2393 err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE, 2394 gfp, vmf, &ret); 2395 if (err) 2396 return vmf_error(err); 2397 if (folio) { 2398 vmf->page = folio_file_page(folio, vmf->pgoff); 2399 ret |= VM_FAULT_LOCKED; 2400 } 2401 return ret; 2402 } 2403 2404 unsigned long shmem_get_unmapped_area(struct file *file, 2405 unsigned long uaddr, unsigned long len, 2406 unsigned long pgoff, unsigned long flags) 2407 { 2408 unsigned long addr; 2409 unsigned long offset; 2410 unsigned long inflated_len; 2411 unsigned long inflated_addr; 2412 unsigned long inflated_offset; 2413 unsigned long hpage_size; 2414 2415 if (len > TASK_SIZE) 2416 return -ENOMEM; 2417 2418 addr = mm_get_unmapped_area(current->mm, file, uaddr, len, pgoff, 2419 flags); 2420 2421 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) 2422 return addr; 2423 if (IS_ERR_VALUE(addr)) 2424 return addr; 2425 if (addr & ~PAGE_MASK) 2426 return addr; 2427 if (addr > TASK_SIZE - len) 2428 return addr; 2429 2430 if (shmem_huge == SHMEM_HUGE_DENY) 2431 return addr; 2432 if (flags & MAP_FIXED) 2433 return addr; 2434 /* 2435 * Our priority is to support MAP_SHARED mapped hugely; 2436 * and support MAP_PRIVATE mapped hugely too, until it is COWed. 2437 * But if caller specified an address hint and we allocated area there 2438 * successfully, respect that as before. 2439 */ 2440 if (uaddr == addr) 2441 return addr; 2442 2443 hpage_size = HPAGE_PMD_SIZE; 2444 if (shmem_huge != SHMEM_HUGE_FORCE) { 2445 struct super_block *sb; 2446 unsigned long __maybe_unused hpage_orders; 2447 int order = 0; 2448 2449 if (file) { 2450 VM_BUG_ON(file->f_op != &shmem_file_operations); 2451 sb = file_inode(file)->i_sb; 2452 } else { 2453 /* 2454 * Called directly from mm/mmap.c, or drivers/char/mem.c 2455 * for "/dev/zero", to create a shared anonymous object. 2456 */ 2457 if (IS_ERR(shm_mnt)) 2458 return addr; 2459 sb = shm_mnt->mnt_sb; 2460 2461 /* 2462 * Find the highest mTHP order used for anonymous shmem to 2463 * provide a suitable alignment address. 
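 * For example, with 4K base pages and only the 2M (order-9) size
 * enabled, hpage_size below becomes 2M and the search length is
 * inflated so that a 2M-aligned address with the right offset can be
 * returned.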
2464 */ 2465 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2466 hpage_orders = READ_ONCE(huge_shmem_orders_always); 2467 hpage_orders |= READ_ONCE(huge_shmem_orders_within_size); 2468 hpage_orders |= READ_ONCE(huge_shmem_orders_madvise); 2469 if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER) 2470 hpage_orders |= READ_ONCE(huge_shmem_orders_inherit); 2471 2472 if (hpage_orders > 0) { 2473 order = highest_order(hpage_orders); 2474 hpage_size = PAGE_SIZE << order; 2475 } 2476 #endif 2477 } 2478 if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER && !order) 2479 return addr; 2480 } 2481 2482 if (len < hpage_size) 2483 return addr; 2484 2485 offset = (pgoff << PAGE_SHIFT) & (hpage_size - 1); 2486 if (offset && offset + len < 2 * hpage_size) 2487 return addr; 2488 if ((addr & (hpage_size - 1)) == offset) 2489 return addr; 2490 2491 inflated_len = len + hpage_size - PAGE_SIZE; 2492 if (inflated_len > TASK_SIZE) 2493 return addr; 2494 if (inflated_len < len) 2495 return addr; 2496 2497 inflated_addr = mm_get_unmapped_area(current->mm, NULL, uaddr, 2498 inflated_len, 0, flags); 2499 if (IS_ERR_VALUE(inflated_addr)) 2500 return addr; 2501 if (inflated_addr & ~PAGE_MASK) 2502 return addr; 2503 2504 inflated_offset = inflated_addr & (hpage_size - 1); 2505 inflated_addr += offset - inflated_offset; 2506 if (inflated_offset > offset) 2507 inflated_addr += hpage_size; 2508 2509 if (inflated_addr > TASK_SIZE - len) 2510 return addr; 2511 return inflated_addr; 2512 } 2513 2514 #ifdef CONFIG_NUMA 2515 static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) 2516 { 2517 struct inode *inode = file_inode(vma->vm_file); 2518 return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol); 2519 } 2520 2521 static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, 2522 unsigned long addr, pgoff_t *ilx) 2523 { 2524 struct inode *inode = file_inode(vma->vm_file); 2525 pgoff_t index; 2526 2527 /* 2528 * Bias interleave by inode number to distribute better across nodes; 2529 * but this interface is independent of which page order is used, so 2530 * supplies only that bias, letting caller apply the offset (adjusted 2531 * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()). 2532 */ 2533 *ilx = inode->i_ino; 2534 index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 2535 return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); 2536 } 2537 2538 static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, 2539 pgoff_t index, unsigned int order, pgoff_t *ilx) 2540 { 2541 struct mempolicy *mpol; 2542 2543 /* Bias interleave by inode number to distribute better across nodes */ 2544 *ilx = info->vfs_inode.i_ino + (index >> order); 2545 2546 mpol = mpol_shared_policy_lookup(&info->policy, index); 2547 return mpol ? mpol : get_task_policy(current); 2548 } 2549 #else 2550 static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, 2551 pgoff_t index, unsigned int order, pgoff_t *ilx) 2552 { 2553 *ilx = 0; 2554 return NULL; 2555 } 2556 #endif /* CONFIG_NUMA */ 2557 2558 int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) 2559 { 2560 struct inode *inode = file_inode(file); 2561 struct shmem_inode_info *info = SHMEM_I(inode); 2562 int retval = -ENOMEM; 2563 2564 /* 2565 * What serializes the accesses to info->flags? 2566 * ipc_lock_object() when called from shmctl_do_lock(), 2567 * no serialization needed when called from shm_destroy(). 
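 *
 * For instance (userspace sketch, SysV path):
 *
 *	int id = shmget(IPC_PRIVATE, 1 << 20, IPC_CREAT | 0600);
 *	shmctl(id, SHM_LOCK, NULL);
 *
 * reaches this point with lock=1 and marks the mapping unevictable;
 * a later SHM_UNLOCK arrives with lock=0 to undo it.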
2568 */ 2569 if (lock && !(info->flags & VM_LOCKED)) { 2570 if (!user_shm_lock(inode->i_size, ucounts)) 2571 goto out_nomem; 2572 info->flags |= VM_LOCKED; 2573 mapping_set_unevictable(file->f_mapping); 2574 } 2575 if (!lock && (info->flags & VM_LOCKED) && ucounts) { 2576 user_shm_unlock(inode->i_size, ucounts); 2577 info->flags &= ~VM_LOCKED; 2578 mapping_clear_unevictable(file->f_mapping); 2579 } 2580 retval = 0; 2581 2582 out_nomem: 2583 return retval; 2584 } 2585 2586 static int shmem_mmap(struct file *file, struct vm_area_struct *vma) 2587 { 2588 struct inode *inode = file_inode(file); 2589 struct shmem_inode_info *info = SHMEM_I(inode); 2590 int ret; 2591 2592 ret = seal_check_write(info->seals, vma); 2593 if (ret) 2594 return ret; 2595 2596 /* arm64 - allow memory tagging on RAM-based files */ 2597 vm_flags_set(vma, VM_MTE_ALLOWED); 2598 2599 file_accessed(file); 2600 /* This is anonymous shared memory if it is unlinked at the time of mmap */ 2601 if (inode->i_nlink) 2602 vma->vm_ops = &shmem_vm_ops; 2603 else 2604 vma->vm_ops = &shmem_anon_vm_ops; 2605 return 0; 2606 } 2607 2608 static int shmem_file_open(struct inode *inode, struct file *file) 2609 { 2610 file->f_mode |= FMODE_CAN_ODIRECT; 2611 return generic_file_open(inode, file); 2612 } 2613 2614 #ifdef CONFIG_TMPFS_XATTR 2615 static int shmem_initxattrs(struct inode *, const struct xattr *, void *); 2616 2617 /* 2618 * chattr's fsflags are unrelated to extended attributes, 2619 * but tmpfs has chosen to enable them under the same config option. 2620 */ 2621 static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) 2622 { 2623 unsigned int i_flags = 0; 2624 2625 if (fsflags & FS_NOATIME_FL) 2626 i_flags |= S_NOATIME; 2627 if (fsflags & FS_APPEND_FL) 2628 i_flags |= S_APPEND; 2629 if (fsflags & FS_IMMUTABLE_FL) 2630 i_flags |= S_IMMUTABLE; 2631 /* 2632 * But FS_NODUMP_FL does not require any action in i_flags. 2633 */ 2634 inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE); 2635 } 2636 #else 2637 static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) 2638 { 2639 } 2640 #define shmem_initxattrs NULL 2641 #endif 2642 2643 static struct offset_ctx *shmem_get_offset_ctx(struct inode *inode) 2644 { 2645 return &SHMEM_I(inode)->dir_offsets; 2646 } 2647 2648 static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, 2649 struct super_block *sb, 2650 struct inode *dir, umode_t mode, 2651 dev_t dev, unsigned long flags) 2652 { 2653 struct inode *inode; 2654 struct shmem_inode_info *info; 2655 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 2656 ino_t ino; 2657 int err; 2658 2659 err = shmem_reserve_inode(sb, &ino); 2660 if (err) 2661 return ERR_PTR(err); 2662 2663 inode = new_inode(sb); 2664 if (!inode) { 2665 shmem_free_inode(sb, 0); 2666 return ERR_PTR(-ENOSPC); 2667 } 2668 2669 inode->i_ino = ino; 2670 inode_init_owner(idmap, inode, dir, mode); 2671 inode->i_blocks = 0; 2672 simple_inode_init_ts(inode); 2673 inode->i_generation = get_random_u32(); 2674 info = SHMEM_I(inode); 2675 memset(info, 0, (char *)inode - (char *)info); 2676 spin_lock_init(&info->lock); 2677 atomic_set(&info->stop_eviction, 0); 2678 info->seals = F_SEAL_SEAL; 2679 info->flags = flags & VM_NORESERVE; 2680 info->i_crtime = inode_get_mtime(inode); 2681 info->fsflags = (dir == NULL) ? 
0 : 2682 SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED; 2683 if (info->fsflags) 2684 shmem_set_inode_flags(inode, info->fsflags); 2685 INIT_LIST_HEAD(&info->shrinklist); 2686 INIT_LIST_HEAD(&info->swaplist); 2687 simple_xattrs_init(&info->xattrs); 2688 cache_no_acl(inode); 2689 if (sbinfo->noswap) 2690 mapping_set_unevictable(inode->i_mapping); 2691 mapping_set_large_folios(inode->i_mapping); 2692 2693 switch (mode & S_IFMT) { 2694 default: 2695 inode->i_op = &shmem_special_inode_operations; 2696 init_special_inode(inode, mode, dev); 2697 break; 2698 case S_IFREG: 2699 inode->i_mapping->a_ops = &shmem_aops; 2700 inode->i_op = &shmem_inode_operations; 2701 inode->i_fop = &shmem_file_operations; 2702 mpol_shared_policy_init(&info->policy, 2703 shmem_get_sbmpol(sbinfo)); 2704 break; 2705 case S_IFDIR: 2706 inc_nlink(inode); 2707 /* Some things misbehave if size == 0 on a directory */ 2708 inode->i_size = 2 * BOGO_DIRENT_SIZE; 2709 inode->i_op = &shmem_dir_inode_operations; 2710 inode->i_fop = &simple_offset_dir_operations; 2711 simple_offset_init(shmem_get_offset_ctx(inode)); 2712 break; 2713 case S_IFLNK: 2714 /* 2715 * Must not load anything in the rbtree, 2716 * mpol_free_shared_policy will not be called. 2717 */ 2718 mpol_shared_policy_init(&info->policy, NULL); 2719 break; 2720 } 2721 2722 lockdep_annotate_inode_mutex_key(inode); 2723 return inode; 2724 } 2725 2726 #ifdef CONFIG_TMPFS_QUOTA 2727 static struct inode *shmem_get_inode(struct mnt_idmap *idmap, 2728 struct super_block *sb, struct inode *dir, 2729 umode_t mode, dev_t dev, unsigned long flags) 2730 { 2731 int err; 2732 struct inode *inode; 2733 2734 inode = __shmem_get_inode(idmap, sb, dir, mode, dev, flags); 2735 if (IS_ERR(inode)) 2736 return inode; 2737 2738 err = dquot_initialize(inode); 2739 if (err) 2740 goto errout; 2741 2742 err = dquot_alloc_inode(inode); 2743 if (err) { 2744 dquot_drop(inode); 2745 goto errout; 2746 } 2747 return inode; 2748 2749 errout: 2750 inode->i_flags |= S_NOQUOTA; 2751 iput(inode); 2752 return ERR_PTR(err); 2753 } 2754 #else 2755 static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, 2756 struct super_block *sb, struct inode *dir, 2757 umode_t mode, dev_t dev, unsigned long flags) 2758 { 2759 return __shmem_get_inode(idmap, sb, dir, mode, dev, flags); 2760 } 2761 #endif /* CONFIG_TMPFS_QUOTA */ 2762 2763 #ifdef CONFIG_USERFAULTFD 2764 int shmem_mfill_atomic_pte(pmd_t *dst_pmd, 2765 struct vm_area_struct *dst_vma, 2766 unsigned long dst_addr, 2767 unsigned long src_addr, 2768 uffd_flags_t flags, 2769 struct folio **foliop) 2770 { 2771 struct inode *inode = file_inode(dst_vma->vm_file); 2772 struct shmem_inode_info *info = SHMEM_I(inode); 2773 struct address_space *mapping = inode->i_mapping; 2774 gfp_t gfp = mapping_gfp_mask(mapping); 2775 pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); 2776 void *page_kaddr; 2777 struct folio *folio; 2778 int ret; 2779 pgoff_t max_off; 2780 2781 if (shmem_inode_acct_blocks(inode, 1)) { 2782 /* 2783 * We may have got a page, returned -ENOENT triggering a retry, 2784 * and now we find ourselves with -ENOMEM. Release the page, to 2785 * avoid a BUG_ON in our caller. 
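 * (Returning -ENOENT with *foliop set is how the MFILL_ATOMIC_COPY path
 * below asks its caller to redo the copy_from_user() without mmap_lock
 * held and then retry with the filled folio.)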
2786 */ 2787 if (unlikely(*foliop)) { 2788 folio_put(*foliop); 2789 *foliop = NULL; 2790 } 2791 return -ENOMEM; 2792 } 2793 2794 if (!*foliop) { 2795 ret = -ENOMEM; 2796 folio = shmem_alloc_folio(gfp, 0, info, pgoff); 2797 if (!folio) 2798 goto out_unacct_blocks; 2799 2800 if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) { 2801 page_kaddr = kmap_local_folio(folio, 0); 2802 /* 2803 * The read mmap_lock is held here. Despite the 2804 * mmap_lock being read recursive a deadlock is still 2805 * possible if a writer has taken a lock. For example: 2806 * 2807 * process A thread 1 takes read lock on own mmap_lock 2808 * process A thread 2 calls mmap, blocks taking write lock 2809 * process B thread 1 takes page fault, read lock on own mmap lock 2810 * process B thread 2 calls mmap, blocks taking write lock 2811 * process A thread 1 blocks taking read lock on process B 2812 * process B thread 1 blocks taking read lock on process A 2813 * 2814 * Disable page faults to prevent potential deadlock 2815 * and retry the copy outside the mmap_lock. 2816 */ 2817 pagefault_disable(); 2818 ret = copy_from_user(page_kaddr, 2819 (const void __user *)src_addr, 2820 PAGE_SIZE); 2821 pagefault_enable(); 2822 kunmap_local(page_kaddr); 2823 2824 /* fallback to copy_from_user outside mmap_lock */ 2825 if (unlikely(ret)) { 2826 *foliop = folio; 2827 ret = -ENOENT; 2828 /* don't free the page */ 2829 goto out_unacct_blocks; 2830 } 2831 2832 flush_dcache_folio(folio); 2833 } else { /* ZEROPAGE */ 2834 clear_user_highpage(&folio->page, dst_addr); 2835 } 2836 } else { 2837 folio = *foliop; 2838 VM_BUG_ON_FOLIO(folio_test_large(folio), folio); 2839 *foliop = NULL; 2840 } 2841 2842 VM_BUG_ON(folio_test_locked(folio)); 2843 VM_BUG_ON(folio_test_swapbacked(folio)); 2844 __folio_set_locked(folio); 2845 __folio_set_swapbacked(folio); 2846 __folio_mark_uptodate(folio); 2847 2848 ret = -EFAULT; 2849 max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 2850 if (unlikely(pgoff >= max_off)) 2851 goto out_release; 2852 2853 ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp); 2854 if (ret) 2855 goto out_release; 2856 ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp); 2857 if (ret) 2858 goto out_release; 2859 2860 ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, 2861 &folio->page, true, flags); 2862 if (ret) 2863 goto out_delete_from_cache; 2864 2865 shmem_recalc_inode(inode, 1, 0); 2866 folio_unlock(folio); 2867 return 0; 2868 out_delete_from_cache: 2869 filemap_remove_folio(folio); 2870 out_release: 2871 folio_unlock(folio); 2872 folio_put(folio); 2873 out_unacct_blocks: 2874 shmem_inode_unacct_blocks(inode, 1); 2875 return ret; 2876 } 2877 #endif /* CONFIG_USERFAULTFD */ 2878 2879 #ifdef CONFIG_TMPFS 2880 static const struct inode_operations shmem_symlink_inode_operations; 2881 static const struct inode_operations shmem_short_symlink_operations; 2882 2883 static int 2884 shmem_write_begin(struct file *file, struct address_space *mapping, 2885 loff_t pos, unsigned len, 2886 struct page **pagep, void **fsdata) 2887 { 2888 struct inode *inode = mapping->host; 2889 struct shmem_inode_info *info = SHMEM_I(inode); 2890 pgoff_t index = pos >> PAGE_SHIFT; 2891 struct folio *folio; 2892 int ret = 0; 2893 2894 /* i_rwsem is held by caller */ 2895 if (unlikely(info->seals & (F_SEAL_GROW | 2896 F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) { 2897 if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) 2898 return -EPERM; 2899 if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) 2900 return -EPERM; 2901 } 2902 2903 
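/*
 * For instance (userspace sketch on a sealed memfd; the fd and name are
 * illustrative only):
 *
 *	int fd = memfd_create("example", MFD_ALLOW_SEALING);
 *	write(fd, "hello", 5);				// succeeds
 *	fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE);
 *	write(fd, "again", 5);				// fails with EPERM above
 *
 * F_SEAL_GROW alone still permits rewriting existing bytes, but any
 * write that would extend past i_size is refused by the check above.
 */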
ret = shmem_get_folio(inode, index, &folio, SGP_WRITE); 2904 if (ret) 2905 return ret; 2906 2907 *pagep = folio_file_page(folio, index); 2908 if (PageHWPoison(*pagep)) { 2909 folio_unlock(folio); 2910 folio_put(folio); 2911 *pagep = NULL; 2912 return -EIO; 2913 } 2914 2915 return 0; 2916 } 2917 2918 static int 2919 shmem_write_end(struct file *file, struct address_space *mapping, 2920 loff_t pos, unsigned len, unsigned copied, 2921 struct page *page, void *fsdata) 2922 { 2923 struct folio *folio = page_folio(page); 2924 struct inode *inode = mapping->host; 2925 2926 if (pos + copied > inode->i_size) 2927 i_size_write(inode, pos + copied); 2928 2929 if (!folio_test_uptodate(folio)) { 2930 if (copied < folio_size(folio)) { 2931 size_t from = offset_in_folio(folio, pos); 2932 folio_zero_segments(folio, 0, from, 2933 from + copied, folio_size(folio)); 2934 } 2935 folio_mark_uptodate(folio); 2936 } 2937 folio_mark_dirty(folio); 2938 folio_unlock(folio); 2939 folio_put(folio); 2940 2941 return copied; 2942 } 2943 2944 static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) 2945 { 2946 struct file *file = iocb->ki_filp; 2947 struct inode *inode = file_inode(file); 2948 struct address_space *mapping = inode->i_mapping; 2949 pgoff_t index; 2950 unsigned long offset; 2951 int error = 0; 2952 ssize_t retval = 0; 2953 loff_t *ppos = &iocb->ki_pos; 2954 2955 index = *ppos >> PAGE_SHIFT; 2956 offset = *ppos & ~PAGE_MASK; 2957 2958 for (;;) { 2959 struct folio *folio = NULL; 2960 struct page *page = NULL; 2961 pgoff_t end_index; 2962 unsigned long nr, ret; 2963 loff_t i_size = i_size_read(inode); 2964 2965 end_index = i_size >> PAGE_SHIFT; 2966 if (index > end_index) 2967 break; 2968 if (index == end_index) { 2969 nr = i_size & ~PAGE_MASK; 2970 if (nr <= offset) 2971 break; 2972 } 2973 2974 error = shmem_get_folio(inode, index, &folio, SGP_READ); 2975 if (error) { 2976 if (error == -EINVAL) 2977 error = 0; 2978 break; 2979 } 2980 if (folio) { 2981 folio_unlock(folio); 2982 2983 page = folio_file_page(folio, index); 2984 if (PageHWPoison(page)) { 2985 folio_put(folio); 2986 error = -EIO; 2987 break; 2988 } 2989 } 2990 2991 /* 2992 * We must evaluate after, since reads (unlike writes) 2993 * are called without i_rwsem protection against truncate 2994 */ 2995 nr = PAGE_SIZE; 2996 i_size = i_size_read(inode); 2997 end_index = i_size >> PAGE_SHIFT; 2998 if (index == end_index) { 2999 nr = i_size & ~PAGE_MASK; 3000 if (nr <= offset) { 3001 if (folio) 3002 folio_put(folio); 3003 break; 3004 } 3005 } 3006 nr -= offset; 3007 3008 if (folio) { 3009 /* 3010 * If users can be writing to this page using arbitrary 3011 * virtual addresses, take care about potential aliasing 3012 * before reading the page on the kernel side. 3013 */ 3014 if (mapping_writably_mapped(mapping)) 3015 flush_dcache_page(page); 3016 /* 3017 * Mark the page accessed if we read the beginning. 3018 */ 3019 if (!offset) 3020 folio_mark_accessed(folio); 3021 /* 3022 * Ok, we have the page, and it's up-to-date, so 3023 * now we can copy it to user space... 3024 */ 3025 ret = copy_page_to_iter(page, offset, nr, to); 3026 folio_put(folio); 3027 3028 } else if (user_backed_iter(to)) { 3029 /* 3030 * Copy to user tends to be so well optimized, but 3031 * clear_user() not so much, that it is noticeably 3032 * faster to copy the zero page instead of clearing. 3033 */ 3034 ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to); 3035 } else { 3036 /* 3037 * But submitting the same page twice in a row to 3038 * splice() - or others? 
- can result in confusion: 3039 * so don't attempt that optimization on pipes etc. 3040 */ 3041 ret = iov_iter_zero(nr, to); 3042 } 3043 3044 retval += ret; 3045 offset += ret; 3046 index += offset >> PAGE_SHIFT; 3047 offset &= ~PAGE_MASK; 3048 3049 if (!iov_iter_count(to)) 3050 break; 3051 if (ret < nr) { 3052 error = -EFAULT; 3053 break; 3054 } 3055 cond_resched(); 3056 } 3057 3058 *ppos = ((loff_t) index << PAGE_SHIFT) + offset; 3059 file_accessed(file); 3060 return retval ? retval : error; 3061 } 3062 3063 static ssize_t shmem_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 3064 { 3065 struct file *file = iocb->ki_filp; 3066 struct inode *inode = file->f_mapping->host; 3067 ssize_t ret; 3068 3069 inode_lock(inode); 3070 ret = generic_write_checks(iocb, from); 3071 if (ret <= 0) 3072 goto unlock; 3073 ret = file_remove_privs(file); 3074 if (ret) 3075 goto unlock; 3076 ret = file_update_time(file); 3077 if (ret) 3078 goto unlock; 3079 ret = generic_perform_write(iocb, from); 3080 unlock: 3081 inode_unlock(inode); 3082 return ret; 3083 } 3084 3085 static bool zero_pipe_buf_get(struct pipe_inode_info *pipe, 3086 struct pipe_buffer *buf) 3087 { 3088 return true; 3089 } 3090 3091 static void zero_pipe_buf_release(struct pipe_inode_info *pipe, 3092 struct pipe_buffer *buf) 3093 { 3094 } 3095 3096 static bool zero_pipe_buf_try_steal(struct pipe_inode_info *pipe, 3097 struct pipe_buffer *buf) 3098 { 3099 return false; 3100 } 3101 3102 static const struct pipe_buf_operations zero_pipe_buf_ops = { 3103 .release = zero_pipe_buf_release, 3104 .try_steal = zero_pipe_buf_try_steal, 3105 .get = zero_pipe_buf_get, 3106 }; 3107 3108 static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe, 3109 loff_t fpos, size_t size) 3110 { 3111 size_t offset = fpos & ~PAGE_MASK; 3112 3113 size = min_t(size_t, size, PAGE_SIZE - offset); 3114 3115 if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { 3116 struct pipe_buffer *buf = pipe_head_buf(pipe); 3117 3118 *buf = (struct pipe_buffer) { 3119 .ops = &zero_pipe_buf_ops, 3120 .page = ZERO_PAGE(0), 3121 .offset = offset, 3122 .len = size, 3123 }; 3124 pipe->head++; 3125 } 3126 3127 return size; 3128 } 3129 3130 static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, 3131 struct pipe_inode_info *pipe, 3132 size_t len, unsigned int flags) 3133 { 3134 struct inode *inode = file_inode(in); 3135 struct address_space *mapping = inode->i_mapping; 3136 struct folio *folio = NULL; 3137 size_t total_spliced = 0, used, npages, n, part; 3138 loff_t isize; 3139 int error = 0; 3140 3141 /* Work out how much data we can actually add into the pipe */ 3142 used = pipe_occupancy(pipe->head, pipe->tail); 3143 npages = max_t(ssize_t, pipe->max_usage - used, 0); 3144 len = min_t(size_t, len, npages * PAGE_SIZE); 3145 3146 do { 3147 if (*ppos >= i_size_read(inode)) 3148 break; 3149 3150 error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio, 3151 SGP_READ); 3152 if (error) { 3153 if (error == -EINVAL) 3154 error = 0; 3155 break; 3156 } 3157 if (folio) { 3158 folio_unlock(folio); 3159 3160 if (folio_test_hwpoison(folio) || 3161 (folio_test_large(folio) && 3162 folio_test_has_hwpoisoned(folio))) { 3163 error = -EIO; 3164 break; 3165 } 3166 } 3167 3168 /* 3169 * i_size must be checked after we know the pages are Uptodate. 
3170 * 3171 * Checking i_size after the check allows us to calculate 3172 * the correct value for "nr", which means the zero-filled 3173 * part of the page is not copied back to userspace (unless 3174 * another truncate extends the file - this is desired though). 3175 */ 3176 isize = i_size_read(inode); 3177 if (unlikely(*ppos >= isize)) 3178 break; 3179 part = min_t(loff_t, isize - *ppos, len); 3180 3181 if (folio) { 3182 /* 3183 * If users can be writing to this page using arbitrary 3184 * virtual addresses, take care about potential aliasing 3185 * before reading the page on the kernel side. 3186 */ 3187 if (mapping_writably_mapped(mapping)) 3188 flush_dcache_folio(folio); 3189 folio_mark_accessed(folio); 3190 /* 3191 * Ok, we have the page, and it's up-to-date, so we can 3192 * now splice it into the pipe. 3193 */ 3194 n = splice_folio_into_pipe(pipe, folio, *ppos, part); 3195 folio_put(folio); 3196 folio = NULL; 3197 } else { 3198 n = splice_zeropage_into_pipe(pipe, *ppos, part); 3199 } 3200 3201 if (!n) 3202 break; 3203 len -= n; 3204 total_spliced += n; 3205 *ppos += n; 3206 in->f_ra.prev_pos = *ppos; 3207 if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) 3208 break; 3209 3210 cond_resched(); 3211 } while (len); 3212 3213 if (folio) 3214 folio_put(folio); 3215 3216 file_accessed(in); 3217 return total_spliced ? total_spliced : error; 3218 } 3219 3220 static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) 3221 { 3222 struct address_space *mapping = file->f_mapping; 3223 struct inode *inode = mapping->host; 3224 3225 if (whence != SEEK_DATA && whence != SEEK_HOLE) 3226 return generic_file_llseek_size(file, offset, whence, 3227 MAX_LFS_FILESIZE, i_size_read(inode)); 3228 if (offset < 0) 3229 return -ENXIO; 3230 3231 inode_lock(inode); 3232 /* We're holding i_rwsem so we can access i_size directly */ 3233 offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence); 3234 if (offset >= 0) 3235 offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); 3236 inode_unlock(inode); 3237 return offset; 3238 } 3239 3240 static long shmem_fallocate(struct file *file, int mode, loff_t offset, 3241 loff_t len) 3242 { 3243 struct inode *inode = file_inode(file); 3244 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 3245 struct shmem_inode_info *info = SHMEM_I(inode); 3246 struct shmem_falloc shmem_falloc; 3247 pgoff_t start, index, end, undo_fallocend; 3248 int error; 3249 3250 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 3251 return -EOPNOTSUPP; 3252 3253 inode_lock(inode); 3254 3255 if (mode & FALLOC_FL_PUNCH_HOLE) { 3256 struct address_space *mapping = file->f_mapping; 3257 loff_t unmap_start = round_up(offset, PAGE_SIZE); 3258 loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; 3259 DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); 3260 3261 /* protected by i_rwsem */ 3262 if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { 3263 error = -EPERM; 3264 goto out; 3265 } 3266 3267 shmem_falloc.waitq = &shmem_falloc_waitq; 3268 shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT; 3269 shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; 3270 spin_lock(&inode->i_lock); 3271 inode->i_private = &shmem_falloc; 3272 spin_unlock(&inode->i_lock); 3273 3274 if ((u64)unmap_end > (u64)unmap_start) 3275 unmap_mapping_range(mapping, unmap_start, 3276 1 + unmap_end - unmap_start, 0); 3277 shmem_truncate_range(inode, offset, offset + len - 1); 3278 /* No need to unmap again: hole-punching leaves COWed pages */ 3279 3280 spin_lock(&inode->i_lock); 
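/*
 * Punch done: clear i_private and wake any faulting tasks parked in
 * shmem_falloc_wait().  (The usual trigger for all of this is a call
 * such as fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 * offset, len) from userspace.)
 */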
3281 inode->i_private = NULL; 3282 wake_up_all(&shmem_falloc_waitq); 3283 WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head)); 3284 spin_unlock(&inode->i_lock); 3285 error = 0; 3286 goto out; 3287 } 3288 3289 /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ 3290 error = inode_newsize_ok(inode, offset + len); 3291 if (error) 3292 goto out; 3293 3294 if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) { 3295 error = -EPERM; 3296 goto out; 3297 } 3298 3299 start = offset >> PAGE_SHIFT; 3300 end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 3301 /* Try to avoid a swapstorm if len is impossible to satisfy */ 3302 if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) { 3303 error = -ENOSPC; 3304 goto out; 3305 } 3306 3307 shmem_falloc.waitq = NULL; 3308 shmem_falloc.start = start; 3309 shmem_falloc.next = start; 3310 shmem_falloc.nr_falloced = 0; 3311 shmem_falloc.nr_unswapped = 0; 3312 spin_lock(&inode->i_lock); 3313 inode->i_private = &shmem_falloc; 3314 spin_unlock(&inode->i_lock); 3315 3316 /* 3317 * info->fallocend is only relevant when huge pages might be 3318 * involved: to prevent split_huge_page() freeing fallocated 3319 * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size. 3320 */ 3321 undo_fallocend = info->fallocend; 3322 if (info->fallocend < end) 3323 info->fallocend = end; 3324 3325 for (index = start; index < end; ) { 3326 struct folio *folio; 3327 3328 /* 3329 * Check for fatal signal so that we abort early in OOM 3330 * situations. We don't want to abort in case of non-fatal 3331 * signals as large fallocate can take noticeable time and 3332 * e.g. periodic timers may result in fallocate constantly 3333 * restarting. 3334 */ 3335 if (fatal_signal_pending(current)) 3336 error = -EINTR; 3337 else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) 3338 error = -ENOMEM; 3339 else 3340 error = shmem_get_folio(inode, index, &folio, 3341 SGP_FALLOC); 3342 if (error) { 3343 info->fallocend = undo_fallocend; 3344 /* Remove the !uptodate folios we added */ 3345 if (index > start) { 3346 shmem_undo_range(inode, 3347 (loff_t)start << PAGE_SHIFT, 3348 ((loff_t)index << PAGE_SHIFT) - 1, true); 3349 } 3350 goto undone; 3351 } 3352 3353 /* 3354 * Here is a more important optimization than it appears: 3355 * a second SGP_FALLOC on the same large folio will clear it, 3356 * making it uptodate and un-undoable if we fail later. 3357 */ 3358 index = folio_next_index(folio); 3359 /* Beware 32-bit wraparound */ 3360 if (!index) 3361 index--; 3362 3363 /* 3364 * Inform shmem_writepage() how far we have reached. 3365 * No need for lock or barrier: we have the page lock. 3366 */ 3367 if (!folio_test_uptodate(folio)) 3368 shmem_falloc.nr_falloced += index - shmem_falloc.next; 3369 shmem_falloc.next = index; 3370 3371 /* 3372 * If !uptodate, leave it that way so that freeable folios 3373 * can be recognized if we need to rollback on error later. 3374 * But mark it dirty so that memory pressure will swap rather 3375 * than free the folios we are allocating (and SGP_CACHE folios 3376 * might still be clean: we now need to mark those dirty too). 
3377 */ 3378 folio_mark_dirty(folio); 3379 folio_unlock(folio); 3380 folio_put(folio); 3381 cond_resched(); 3382 } 3383 3384 if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) 3385 i_size_write(inode, offset + len); 3386 undone: 3387 spin_lock(&inode->i_lock); 3388 inode->i_private = NULL; 3389 spin_unlock(&inode->i_lock); 3390 out: 3391 if (!error) 3392 file_modified(file); 3393 inode_unlock(inode); 3394 return error; 3395 } 3396 3397 static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 3398 { 3399 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); 3400 3401 buf->f_type = TMPFS_MAGIC; 3402 buf->f_bsize = PAGE_SIZE; 3403 buf->f_namelen = NAME_MAX; 3404 if (sbinfo->max_blocks) { 3405 buf->f_blocks = sbinfo->max_blocks; 3406 buf->f_bavail = 3407 buf->f_bfree = sbinfo->max_blocks - 3408 percpu_counter_sum(&sbinfo->used_blocks); 3409 } 3410 if (sbinfo->max_inodes) { 3411 buf->f_files = sbinfo->max_inodes; 3412 buf->f_ffree = sbinfo->free_ispace / BOGO_INODE_SIZE; 3413 } 3414 /* else leave those fields 0 like simple_statfs */ 3415 3416 buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b); 3417 3418 return 0; 3419 } 3420 3421 /* 3422 * File creation. Allocate an inode, and we're done.. 3423 */ 3424 static int 3425 shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, 3426 struct dentry *dentry, umode_t mode, dev_t dev) 3427 { 3428 struct inode *inode; 3429 int error; 3430 3431 inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE); 3432 if (IS_ERR(inode)) 3433 return PTR_ERR(inode); 3434 3435 error = simple_acl_create(dir, inode); 3436 if (error) 3437 goto out_iput; 3438 error = security_inode_init_security(inode, dir, &dentry->d_name, 3439 shmem_initxattrs, NULL); 3440 if (error && error != -EOPNOTSUPP) 3441 goto out_iput; 3442 3443 error = simple_offset_add(shmem_get_offset_ctx(dir), dentry); 3444 if (error) 3445 goto out_iput; 3446 3447 dir->i_size += BOGO_DIRENT_SIZE; 3448 inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); 3449 inode_inc_iversion(dir); 3450 d_instantiate(dentry, inode); 3451 dget(dentry); /* Extra count - pin the dentry in core */ 3452 return error; 3453 3454 out_iput: 3455 iput(inode); 3456 return error; 3457 } 3458 3459 static int 3460 shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir, 3461 struct file *file, umode_t mode) 3462 { 3463 struct inode *inode; 3464 int error; 3465 3466 inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE); 3467 if (IS_ERR(inode)) { 3468 error = PTR_ERR(inode); 3469 goto err_out; 3470 } 3471 error = security_inode_init_security(inode, dir, NULL, 3472 shmem_initxattrs, NULL); 3473 if (error && error != -EOPNOTSUPP) 3474 goto out_iput; 3475 error = simple_acl_create(dir, inode); 3476 if (error) 3477 goto out_iput; 3478 d_tmpfile(file, inode); 3479 3480 err_out: 3481 return finish_open_simple(file, error); 3482 out_iput: 3483 iput(inode); 3484 return error; 3485 } 3486 3487 static int shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir, 3488 struct dentry *dentry, umode_t mode) 3489 { 3490 int error; 3491 3492 error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0); 3493 if (error) 3494 return error; 3495 inc_nlink(dir); 3496 return 0; 3497 } 3498 3499 static int shmem_create(struct mnt_idmap *idmap, struct inode *dir, 3500 struct dentry *dentry, umode_t mode, bool excl) 3501 { 3502 return shmem_mknod(idmap, dir, dentry, mode | S_IFREG, 0); 3503 } 3504 3505 /* 3506 * Link a file.. 
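 *
 * For instance (illustrative paths), an O_TMPFILE file on tmpfs can be
 * given a name afterwards:
 *
 *	char path[64];
 *	int fd = open("/dev/shm", O_TMPFILE | O_RDWR, 0600);
 *	snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
 *	linkat(AT_FDCWD, path, AT_FDCWD, "/dev/shm/name", AT_SYMLINK_FOLLOW);
 *
 * in which case i_nlink is still 0 when that first link arrives below.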
3507 */ 3508 static int shmem_link(struct dentry *old_dentry, struct inode *dir, 3509 struct dentry *dentry) 3510 { 3511 struct inode *inode = d_inode(old_dentry); 3512 int ret = 0; 3513 3514 /* 3515 * No ordinary (disk based) filesystem counts links as inodes; 3516 * but each new link needs a new dentry, pinning lowmem, and 3517 * tmpfs dentries cannot be pruned until they are unlinked. 3518 * But if an O_TMPFILE file is linked into the tmpfs, the 3519 * first link must skip that, to get the accounting right. 3520 */ 3521 if (inode->i_nlink) { 3522 ret = shmem_reserve_inode(inode->i_sb, NULL); 3523 if (ret) 3524 goto out; 3525 } 3526 3527 ret = simple_offset_add(shmem_get_offset_ctx(dir), dentry); 3528 if (ret) { 3529 if (inode->i_nlink) 3530 shmem_free_inode(inode->i_sb, 0); 3531 goto out; 3532 } 3533 3534 dir->i_size += BOGO_DIRENT_SIZE; 3535 inode_set_mtime_to_ts(dir, 3536 inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); 3537 inode_inc_iversion(dir); 3538 inc_nlink(inode); 3539 ihold(inode); /* New dentry reference */ 3540 dget(dentry); /* Extra pinning count for the created dentry */ 3541 d_instantiate(dentry, inode); 3542 out: 3543 return ret; 3544 } 3545 3546 static int shmem_unlink(struct inode *dir, struct dentry *dentry) 3547 { 3548 struct inode *inode = d_inode(dentry); 3549 3550 if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) 3551 shmem_free_inode(inode->i_sb, 0); 3552 3553 simple_offset_remove(shmem_get_offset_ctx(dir), dentry); 3554 3555 dir->i_size -= BOGO_DIRENT_SIZE; 3556 inode_set_mtime_to_ts(dir, 3557 inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); 3558 inode_inc_iversion(dir); 3559 drop_nlink(inode); 3560 dput(dentry); /* Undo the count from "create" - does all the work */ 3561 return 0; 3562 } 3563 3564 static int shmem_rmdir(struct inode *dir, struct dentry *dentry) 3565 { 3566 if (!simple_offset_empty(dentry)) 3567 return -ENOTEMPTY; 3568 3569 drop_nlink(d_inode(dentry)); 3570 drop_nlink(dir); 3571 return shmem_unlink(dir, dentry); 3572 } 3573 3574 static int shmem_whiteout(struct mnt_idmap *idmap, 3575 struct inode *old_dir, struct dentry *old_dentry) 3576 { 3577 struct dentry *whiteout; 3578 int error; 3579 3580 whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name); 3581 if (!whiteout) 3582 return -ENOMEM; 3583 3584 error = shmem_mknod(idmap, old_dir, whiteout, 3585 S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); 3586 dput(whiteout); 3587 if (error) 3588 return error; 3589 3590 /* 3591 * Cheat and hash the whiteout while the old dentry is still in 3592 * place, instead of playing games with FS_RENAME_DOES_D_MOVE. 3593 * 3594 * d_lookup() will consistently find one of them at this point, 3595 * not sure which one, but that isn't even important. 3596 */ 3597 d_rehash(whiteout); 3598 return 0; 3599 } 3600 3601 /* 3602 * The VFS layer already does all the dentry stuff for rename; 3603 * we just have to decrement the usage count for the target if 3604 * it exists so that the VFS layer correctly frees it when it 3605 * gets overwritten.
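 *
 * RENAME_EXCHANGE and RENAME_WHITEOUT arrive here via renameat2(); for
 * example (illustrative names)
 *
 *	renameat2(AT_FDCWD, "old", AT_FDCWD, "new", RENAME_EXCHANGE);
 *
 * atomically swaps the two names via simple_offset_rename_exchange().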
3606 */ 3607 static int shmem_rename2(struct mnt_idmap *idmap, 3608 struct inode *old_dir, struct dentry *old_dentry, 3609 struct inode *new_dir, struct dentry *new_dentry, 3610 unsigned int flags) 3611 { 3612 struct inode *inode = d_inode(old_dentry); 3613 int they_are_dirs = S_ISDIR(inode->i_mode); 3614 int error; 3615 3616 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) 3617 return -EINVAL; 3618 3619 if (flags & RENAME_EXCHANGE) 3620 return simple_offset_rename_exchange(old_dir, old_dentry, 3621 new_dir, new_dentry); 3622 3623 if (!simple_offset_empty(new_dentry)) 3624 return -ENOTEMPTY; 3625 3626 if (flags & RENAME_WHITEOUT) { 3627 error = shmem_whiteout(idmap, old_dir, old_dentry); 3628 if (error) 3629 return error; 3630 } 3631 3632 error = simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry); 3633 if (error) 3634 return error; 3635 3636 if (d_really_is_positive(new_dentry)) { 3637 (void) shmem_unlink(new_dir, new_dentry); 3638 if (they_are_dirs) { 3639 drop_nlink(d_inode(new_dentry)); 3640 drop_nlink(old_dir); 3641 } 3642 } else if (they_are_dirs) { 3643 drop_nlink(old_dir); 3644 inc_nlink(new_dir); 3645 } 3646 3647 old_dir->i_size -= BOGO_DIRENT_SIZE; 3648 new_dir->i_size += BOGO_DIRENT_SIZE; 3649 simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); 3650 inode_inc_iversion(old_dir); 3651 inode_inc_iversion(new_dir); 3652 return 0; 3653 } 3654 3655 static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, 3656 struct dentry *dentry, const char *symname) 3657 { 3658 int error; 3659 int len; 3660 struct inode *inode; 3661 struct folio *folio; 3662 3663 len = strlen(symname) + 1; 3664 if (len > PAGE_SIZE) 3665 return -ENAMETOOLONG; 3666 3667 inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0, 3668 VM_NORESERVE); 3669 if (IS_ERR(inode)) 3670 return PTR_ERR(inode); 3671 3672 error = security_inode_init_security(inode, dir, &dentry->d_name, 3673 shmem_initxattrs, NULL); 3674 if (error && error != -EOPNOTSUPP) 3675 goto out_iput; 3676 3677 error = simple_offset_add(shmem_get_offset_ctx(dir), dentry); 3678 if (error) 3679 goto out_iput; 3680 3681 inode->i_size = len-1; 3682 if (len <= SHORT_SYMLINK_LEN) { 3683 inode->i_link = kmemdup(symname, len, GFP_KERNEL); 3684 if (!inode->i_link) { 3685 error = -ENOMEM; 3686 goto out_remove_offset; 3687 } 3688 inode->i_op = &shmem_short_symlink_operations; 3689 } else { 3690 inode_nohighmem(inode); 3691 inode->i_mapping->a_ops = &shmem_aops; 3692 error = shmem_get_folio(inode, 0, &folio, SGP_WRITE); 3693 if (error) 3694 goto out_remove_offset; 3695 inode->i_op = &shmem_symlink_inode_operations; 3696 memcpy(folio_address(folio), symname, len); 3697 folio_mark_uptodate(folio); 3698 folio_mark_dirty(folio); 3699 folio_unlock(folio); 3700 folio_put(folio); 3701 } 3702 dir->i_size += BOGO_DIRENT_SIZE; 3703 inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); 3704 inode_inc_iversion(dir); 3705 d_instantiate(dentry, inode); 3706 dget(dentry); 3707 return 0; 3708 3709 out_remove_offset: 3710 simple_offset_remove(shmem_get_offset_ctx(dir), dentry); 3711 out_iput: 3712 iput(inode); 3713 return error; 3714 } 3715 3716 static void shmem_put_link(void *arg) 3717 { 3718 folio_mark_accessed(arg); 3719 folio_put(arg); 3720 } 3721 3722 static const char *shmem_get_link(struct dentry *dentry, struct inode *inode, 3723 struct delayed_call *done) 3724 { 3725 struct folio *folio = NULL; 3726 int error; 3727 3728 if (!dentry) { 3729 folio = filemap_get_folio(inode->i_mapping, 0); 3730 if 
(IS_ERR(folio)) 3731 return ERR_PTR(-ECHILD); 3732 if (PageHWPoison(folio_page(folio, 0)) || 3733 !folio_test_uptodate(folio)) { 3734 folio_put(folio); 3735 return ERR_PTR(-ECHILD); 3736 } 3737 } else { 3738 error = shmem_get_folio(inode, 0, &folio, SGP_READ); 3739 if (error) 3740 return ERR_PTR(error); 3741 if (!folio) 3742 return ERR_PTR(-ECHILD); 3743 if (PageHWPoison(folio_page(folio, 0))) { 3744 folio_unlock(folio); 3745 folio_put(folio); 3746 return ERR_PTR(-ECHILD); 3747 } 3748 folio_unlock(folio); 3749 } 3750 set_delayed_call(done, shmem_put_link, folio); 3751 return folio_address(folio); 3752 } 3753 3754 #ifdef CONFIG_TMPFS_XATTR 3755 3756 static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa) 3757 { 3758 struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); 3759 3760 fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE); 3761 3762 return 0; 3763 } 3764 3765 static int shmem_fileattr_set(struct mnt_idmap *idmap, 3766 struct dentry *dentry, struct fileattr *fa) 3767 { 3768 struct inode *inode = d_inode(dentry); 3769 struct shmem_inode_info *info = SHMEM_I(inode); 3770 3771 if (fileattr_has_fsx(fa)) 3772 return -EOPNOTSUPP; 3773 if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE) 3774 return -EOPNOTSUPP; 3775 3776 info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) | 3777 (fa->flags & SHMEM_FL_USER_MODIFIABLE); 3778 3779 shmem_set_inode_flags(inode, info->fsflags); 3780 inode_set_ctime_current(inode); 3781 inode_inc_iversion(inode); 3782 return 0; 3783 } 3784 3785 /* 3786 * Superblocks without xattr inode operations may get some security.* xattr 3787 * support from the LSM "for free". As soon as we have any other xattrs 3788 * like ACLs, we also need to implement the security.* handlers at 3789 * filesystem level, though. 3790 */ 3791 3792 /* 3793 * Callback for security_inode_init_security() for acquiring xattrs. 
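 * It runs at inode creation (see shmem_mknod() and friends above): an
 * LSM such as SELinux hands over its security.* attributes here, and
 * when max_inodes is set they are charged against the same free_ispace
 * pool as the rest of the inode metadata.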
3794 */ 3795 static int shmem_initxattrs(struct inode *inode, 3796 const struct xattr *xattr_array, void *fs_info) 3797 { 3798 struct shmem_inode_info *info = SHMEM_I(inode); 3799 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 3800 const struct xattr *xattr; 3801 struct simple_xattr *new_xattr; 3802 size_t ispace = 0; 3803 size_t len; 3804 3805 if (sbinfo->max_inodes) { 3806 for (xattr = xattr_array; xattr->name != NULL; xattr++) { 3807 ispace += simple_xattr_space(xattr->name, 3808 xattr->value_len + XATTR_SECURITY_PREFIX_LEN); 3809 } 3810 if (ispace) { 3811 raw_spin_lock(&sbinfo->stat_lock); 3812 if (sbinfo->free_ispace < ispace) 3813 ispace = 0; 3814 else 3815 sbinfo->free_ispace -= ispace; 3816 raw_spin_unlock(&sbinfo->stat_lock); 3817 if (!ispace) 3818 return -ENOSPC; 3819 } 3820 } 3821 3822 for (xattr = xattr_array; xattr->name != NULL; xattr++) { 3823 new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len); 3824 if (!new_xattr) 3825 break; 3826 3827 len = strlen(xattr->name) + 1; 3828 new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, 3829 GFP_KERNEL_ACCOUNT); 3830 if (!new_xattr->name) { 3831 kvfree(new_xattr); 3832 break; 3833 } 3834 3835 memcpy(new_xattr->name, XATTR_SECURITY_PREFIX, 3836 XATTR_SECURITY_PREFIX_LEN); 3837 memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, 3838 xattr->name, len); 3839 3840 simple_xattr_add(&info->xattrs, new_xattr); 3841 } 3842 3843 if (xattr->name != NULL) { 3844 if (ispace) { 3845 raw_spin_lock(&sbinfo->stat_lock); 3846 sbinfo->free_ispace += ispace; 3847 raw_spin_unlock(&sbinfo->stat_lock); 3848 } 3849 simple_xattrs_free(&info->xattrs, NULL); 3850 return -ENOMEM; 3851 } 3852 3853 return 0; 3854 } 3855 3856 static int shmem_xattr_handler_get(const struct xattr_handler *handler, 3857 struct dentry *unused, struct inode *inode, 3858 const char *name, void *buffer, size_t size) 3859 { 3860 struct shmem_inode_info *info = SHMEM_I(inode); 3861 3862 name = xattr_full_name(handler, name); 3863 return simple_xattr_get(&info->xattrs, name, buffer, size); 3864 } 3865 3866 static int shmem_xattr_handler_set(const struct xattr_handler *handler, 3867 struct mnt_idmap *idmap, 3868 struct dentry *unused, struct inode *inode, 3869 const char *name, const void *value, 3870 size_t size, int flags) 3871 { 3872 struct shmem_inode_info *info = SHMEM_I(inode); 3873 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 3874 struct simple_xattr *old_xattr; 3875 size_t ispace = 0; 3876 3877 name = xattr_full_name(handler, name); 3878 if (value && sbinfo->max_inodes) { 3879 ispace = simple_xattr_space(name, size); 3880 raw_spin_lock(&sbinfo->stat_lock); 3881 if (sbinfo->free_ispace < ispace) 3882 ispace = 0; 3883 else 3884 sbinfo->free_ispace -= ispace; 3885 raw_spin_unlock(&sbinfo->stat_lock); 3886 if (!ispace) 3887 return -ENOSPC; 3888 } 3889 3890 old_xattr = simple_xattr_set(&info->xattrs, name, value, size, flags); 3891 if (!IS_ERR(old_xattr)) { 3892 ispace = 0; 3893 if (old_xattr && sbinfo->max_inodes) 3894 ispace = simple_xattr_space(old_xattr->name, 3895 old_xattr->size); 3896 simple_xattr_free(old_xattr); 3897 old_xattr = NULL; 3898 inode_set_ctime_current(inode); 3899 inode_inc_iversion(inode); 3900 } 3901 if (ispace) { 3902 raw_spin_lock(&sbinfo->stat_lock); 3903 sbinfo->free_ispace += ispace; 3904 raw_spin_unlock(&sbinfo->stat_lock); 3905 } 3906 return PTR_ERR(old_xattr); 3907 } 3908 3909 static const struct xattr_handler shmem_security_xattr_handler = { 3910 .prefix = XATTR_SECURITY_PREFIX, 3911 .get = shmem_xattr_handler_get, 3912 .set 
= shmem_xattr_handler_set, 3913 }; 3914 3915 static const struct xattr_handler shmem_trusted_xattr_handler = { 3916 .prefix = XATTR_TRUSTED_PREFIX, 3917 .get = shmem_xattr_handler_get, 3918 .set = shmem_xattr_handler_set, 3919 }; 3920 3921 static const struct xattr_handler shmem_user_xattr_handler = { 3922 .prefix = XATTR_USER_PREFIX, 3923 .get = shmem_xattr_handler_get, 3924 .set = shmem_xattr_handler_set, 3925 }; 3926 3927 static const struct xattr_handler * const shmem_xattr_handlers[] = { 3928 &shmem_security_xattr_handler, 3929 &shmem_trusted_xattr_handler, 3930 &shmem_user_xattr_handler, 3931 NULL 3932 }; 3933 3934 static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) 3935 { 3936 struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); 3937 return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size); 3938 } 3939 #endif /* CONFIG_TMPFS_XATTR */ 3940 3941 static const struct inode_operations shmem_short_symlink_operations = { 3942 .getattr = shmem_getattr, 3943 .setattr = shmem_setattr, 3944 .get_link = simple_get_link, 3945 #ifdef CONFIG_TMPFS_XATTR 3946 .listxattr = shmem_listxattr, 3947 #endif 3948 }; 3949 3950 static const struct inode_operations shmem_symlink_inode_operations = { 3951 .getattr = shmem_getattr, 3952 .setattr = shmem_setattr, 3953 .get_link = shmem_get_link, 3954 #ifdef CONFIG_TMPFS_XATTR 3955 .listxattr = shmem_listxattr, 3956 #endif 3957 }; 3958 3959 static struct dentry *shmem_get_parent(struct dentry *child) 3960 { 3961 return ERR_PTR(-ESTALE); 3962 } 3963 3964 static int shmem_match(struct inode *ino, void *vfh) 3965 { 3966 __u32 *fh = vfh; 3967 __u64 inum = fh[2]; 3968 inum = (inum << 32) | fh[1]; 3969 return ino->i_ino == inum && fh[0] == ino->i_generation; 3970 } 3971 3972 /* Find any alias of inode, but prefer a hashed alias */ 3973 static struct dentry *shmem_find_alias(struct inode *inode) 3974 { 3975 struct dentry *alias = d_find_alias(inode); 3976 3977 return alias ?: d_find_any_alias(inode); 3978 } 3979 3980 static struct dentry *shmem_fh_to_dentry(struct super_block *sb, 3981 struct fid *fid, int fh_len, int fh_type) 3982 { 3983 struct inode *inode; 3984 struct dentry *dentry = NULL; 3985 u64 inum; 3986 3987 if (fh_len < 3) 3988 return NULL; 3989 3990 inum = fid->raw[2]; 3991 inum = (inum << 32) | fid->raw[1]; 3992 3993 inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), 3994 shmem_match, fid->raw); 3995 if (inode) { 3996 dentry = shmem_find_alias(inode); 3997 iput(inode); 3998 } 3999 4000 return dentry; 4001 } 4002 4003 static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len, 4004 struct inode *parent) 4005 { 4006 if (*len < 3) { 4007 *len = 3; 4008 return FILEID_INVALID; 4009 } 4010 4011 if (inode_unhashed(inode)) { 4012 /* Unfortunately insert_inode_hash is not idempotent, 4013 * so as we hash inodes here rather than at creation 4014 * time, we need a lock to ensure we only try 4015 * to do it once 4016 */ 4017 static DEFINE_SPINLOCK(lock); 4018 spin_lock(&lock); 4019 if (inode_unhashed(inode)) 4020 __insert_inode_hash(inode, 4021 inode->i_ino + inode->i_generation); 4022 spin_unlock(&lock); 4023 } 4024 4025 fh[0] = inode->i_generation; 4026 fh[1] = inode->i_ino; 4027 fh[2] = ((__u64)inode->i_ino) >> 32; 4028 4029 *len = 3; 4030 return 1; 4031 } 4032 4033 static const struct export_operations shmem_export_ops = { 4034 .get_parent = shmem_get_parent, 4035 .encode_fh = shmem_encode_fh, 4036 .fh_to_dentry = shmem_fh_to_dentry, 4037 }; 4038 4039 enum shmem_param { 4040 Opt_gid, 4041 Opt_huge, 
4042 Opt_mode, 4043 Opt_mpol, 4044 Opt_nr_blocks, 4045 Opt_nr_inodes, 4046 Opt_size, 4047 Opt_uid, 4048 Opt_inode32, 4049 Opt_inode64, 4050 Opt_noswap, 4051 Opt_quota, 4052 Opt_usrquota, 4053 Opt_grpquota, 4054 Opt_usrquota_block_hardlimit, 4055 Opt_usrquota_inode_hardlimit, 4056 Opt_grpquota_block_hardlimit, 4057 Opt_grpquota_inode_hardlimit, 4058 }; 4059 4060 static const struct constant_table shmem_param_enums_huge[] = { 4061 {"never", SHMEM_HUGE_NEVER }, 4062 {"always", SHMEM_HUGE_ALWAYS }, 4063 {"within_size", SHMEM_HUGE_WITHIN_SIZE }, 4064 {"advise", SHMEM_HUGE_ADVISE }, 4065 {} 4066 }; 4067 4068 const struct fs_parameter_spec shmem_fs_parameters[] = { 4069 fsparam_gid ("gid", Opt_gid), 4070 fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge), 4071 fsparam_u32oct("mode", Opt_mode), 4072 fsparam_string("mpol", Opt_mpol), 4073 fsparam_string("nr_blocks", Opt_nr_blocks), 4074 fsparam_string("nr_inodes", Opt_nr_inodes), 4075 fsparam_string("size", Opt_size), 4076 fsparam_uid ("uid", Opt_uid), 4077 fsparam_flag ("inode32", Opt_inode32), 4078 fsparam_flag ("inode64", Opt_inode64), 4079 fsparam_flag ("noswap", Opt_noswap), 4080 #ifdef CONFIG_TMPFS_QUOTA 4081 fsparam_flag ("quota", Opt_quota), 4082 fsparam_flag ("usrquota", Opt_usrquota), 4083 fsparam_flag ("grpquota", Opt_grpquota), 4084 fsparam_string("usrquota_block_hardlimit", Opt_usrquota_block_hardlimit), 4085 fsparam_string("usrquota_inode_hardlimit", Opt_usrquota_inode_hardlimit), 4086 fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit), 4087 fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit), 4088 #endif 4089 {} 4090 }; 4091 4092 static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) 4093 { 4094 struct shmem_options *ctx = fc->fs_private; 4095 struct fs_parse_result result; 4096 unsigned long long size; 4097 char *rest; 4098 int opt; 4099 kuid_t kuid; 4100 kgid_t kgid; 4101 4102 opt = fs_parse(fc, shmem_fs_parameters, param, &result); 4103 if (opt < 0) 4104 return opt; 4105 4106 switch (opt) { 4107 case Opt_size: 4108 size = memparse(param->string, &rest); 4109 if (*rest == '%') { 4110 size <<= PAGE_SHIFT; 4111 size *= totalram_pages(); 4112 do_div(size, 100); 4113 rest++; 4114 } 4115 if (*rest) 4116 goto bad_value; 4117 ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE); 4118 ctx->seen |= SHMEM_SEEN_BLOCKS; 4119 break; 4120 case Opt_nr_blocks: 4121 ctx->blocks = memparse(param->string, &rest); 4122 if (*rest || ctx->blocks > LONG_MAX) 4123 goto bad_value; 4124 ctx->seen |= SHMEM_SEEN_BLOCKS; 4125 break; 4126 case Opt_nr_inodes: 4127 ctx->inodes = memparse(param->string, &rest); 4128 if (*rest || ctx->inodes > ULONG_MAX / BOGO_INODE_SIZE) 4129 goto bad_value; 4130 ctx->seen |= SHMEM_SEEN_INODES; 4131 break; 4132 case Opt_mode: 4133 ctx->mode = result.uint_32 & 07777; 4134 break; 4135 case Opt_uid: 4136 kuid = result.uid; 4137 4138 /* 4139 * The requested uid must be representable in the 4140 * filesystem's idmapping. 4141 */ 4142 if (!kuid_has_mapping(fc->user_ns, kuid)) 4143 goto bad_value; 4144 4145 ctx->uid = kuid; 4146 break; 4147 case Opt_gid: 4148 kgid = result.gid; 4149 4150 /* 4151 * The requested gid must be representable in the 4152 * filesystem's idmapping. 
4153 */ 4154 if (!kgid_has_mapping(fc->user_ns, kgid)) 4155 goto bad_value; 4156 4157 ctx->gid = kgid; 4158 break; 4159 case Opt_huge: 4160 ctx->huge = result.uint_32; 4161 if (ctx->huge != SHMEM_HUGE_NEVER && 4162 !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && 4163 has_transparent_hugepage())) 4164 goto unsupported_parameter; 4165 ctx->seen |= SHMEM_SEEN_HUGE; 4166 break; 4167 case Opt_mpol: 4168 if (IS_ENABLED(CONFIG_NUMA)) { 4169 mpol_put(ctx->mpol); 4170 ctx->mpol = NULL; 4171 if (mpol_parse_str(param->string, &ctx->mpol)) 4172 goto bad_value; 4173 break; 4174 } 4175 goto unsupported_parameter; 4176 case Opt_inode32: 4177 ctx->full_inums = false; 4178 ctx->seen |= SHMEM_SEEN_INUMS; 4179 break; 4180 case Opt_inode64: 4181 if (sizeof(ino_t) < 8) { 4182 return invalfc(fc, 4183 "Cannot use inode64 with <64bit inums in kernel\n"); 4184 } 4185 ctx->full_inums = true; 4186 ctx->seen |= SHMEM_SEEN_INUMS; 4187 break; 4188 case Opt_noswap: 4189 if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) { 4190 return invalfc(fc, 4191 "Turning off swap in unprivileged tmpfs mounts unsupported"); 4192 } 4193 ctx->noswap = true; 4194 ctx->seen |= SHMEM_SEEN_NOSWAP; 4195 break; 4196 case Opt_quota: 4197 if (fc->user_ns != &init_user_ns) 4198 return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); 4199 ctx->seen |= SHMEM_SEEN_QUOTA; 4200 ctx->quota_types |= (QTYPE_MASK_USR | QTYPE_MASK_GRP); 4201 break; 4202 case Opt_usrquota: 4203 if (fc->user_ns != &init_user_ns) 4204 return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); 4205 ctx->seen |= SHMEM_SEEN_QUOTA; 4206 ctx->quota_types |= QTYPE_MASK_USR; 4207 break; 4208 case Opt_grpquota: 4209 if (fc->user_ns != &init_user_ns) 4210 return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); 4211 ctx->seen |= SHMEM_SEEN_QUOTA; 4212 ctx->quota_types |= QTYPE_MASK_GRP; 4213 break; 4214 case Opt_usrquota_block_hardlimit: 4215 size = memparse(param->string, &rest); 4216 if (*rest || !size) 4217 goto bad_value; 4218 if (size > SHMEM_QUOTA_MAX_SPC_LIMIT) 4219 return invalfc(fc, 4220 "User quota block hardlimit too large."); 4221 ctx->qlimits.usrquota_bhardlimit = size; 4222 break; 4223 case Opt_grpquota_block_hardlimit: 4224 size = memparse(param->string, &rest); 4225 if (*rest || !size) 4226 goto bad_value; 4227 if (size > SHMEM_QUOTA_MAX_SPC_LIMIT) 4228 return invalfc(fc, 4229 "Group quota block hardlimit too large."); 4230 ctx->qlimits.grpquota_bhardlimit = size; 4231 break; 4232 case Opt_usrquota_inode_hardlimit: 4233 size = memparse(param->string, &rest); 4234 if (*rest || !size) 4235 goto bad_value; 4236 if (size > SHMEM_QUOTA_MAX_INO_LIMIT) 4237 return invalfc(fc, 4238 "User quota inode hardlimit too large."); 4239 ctx->qlimits.usrquota_ihardlimit = size; 4240 break; 4241 case Opt_grpquota_inode_hardlimit: 4242 size = memparse(param->string, &rest); 4243 if (*rest || !size) 4244 goto bad_value; 4245 if (size > SHMEM_QUOTA_MAX_INO_LIMIT) 4246 return invalfc(fc, 4247 "Group quota inode hardlimit too large."); 4248 ctx->qlimits.grpquota_ihardlimit = size; 4249 break; 4250 } 4251 return 0; 4252 4253 unsupported_parameter: 4254 return invalfc(fc, "Unsupported parameter '%s'", param->key); 4255 bad_value: 4256 return invalfc(fc, "Bad value for '%s'", param->key); 4257 } 4258 4259 static int shmem_parse_options(struct fs_context *fc, void *data) 4260 { 4261 char *options = data; 4262 4263 if (options) { 4264 int err = security_sb_eat_lsm_opts(options, &fc->security); 4265 if (err) 4266 return err; 4267 } 
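        /*
         * Illustrative example (editorial addition, not in the original
         * source): with data = "mpol=bind:0-2,5,size=1G,huge=always", the
         * loop below splits on commas but keeps "bind:0-2,5" intact, since a
         * comma followed by a digit is taken to be part of mpol's nodelist
         * rather than an option separator.  The resulting calls are:
         *
         *      vfs_parse_fs_string(fc, "mpol", "bind:0-2,5", 10);
         *      vfs_parse_fs_string(fc, "size", "1G", 2);
         *      vfs_parse_fs_string(fc, "huge", "always", 6);
         */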
4268 4269 while (options != NULL) { 4270 char *this_char = options; 4271 for (;;) { 4272 /* 4273 * NUL-terminate this option: unfortunately, 4274 * mount options form a comma-separated list, 4275 * but mpol's nodelist may also contain commas. 4276 */ 4277 options = strchr(options, ','); 4278 if (options == NULL) 4279 break; 4280 options++; 4281 if (!isdigit(*options)) { 4282 options[-1] = '\0'; 4283 break; 4284 } 4285 } 4286 if (*this_char) { 4287 char *value = strchr(this_char, '='); 4288 size_t len = 0; 4289 int err; 4290 4291 if (value) { 4292 *value++ = '\0'; 4293 len = strlen(value); 4294 } 4295 err = vfs_parse_fs_string(fc, this_char, value, len); 4296 if (err < 0) 4297 return err; 4298 } 4299 } 4300 return 0; 4301 } 4302 4303 /* 4304 * Reconfigure a shmem filesystem. 4305 */ 4306 static int shmem_reconfigure(struct fs_context *fc) 4307 { 4308 struct shmem_options *ctx = fc->fs_private; 4309 struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb); 4310 unsigned long used_isp; 4311 struct mempolicy *mpol = NULL; 4312 const char *err; 4313 4314 raw_spin_lock(&sbinfo->stat_lock); 4315 used_isp = sbinfo->max_inodes * BOGO_INODE_SIZE - sbinfo->free_ispace; 4316 4317 if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { 4318 if (!sbinfo->max_blocks) { 4319 err = "Cannot retroactively limit size"; 4320 goto out; 4321 } 4322 if (percpu_counter_compare(&sbinfo->used_blocks, 4323 ctx->blocks) > 0) { 4324 err = "Too small a size for current use"; 4325 goto out; 4326 } 4327 } 4328 if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) { 4329 if (!sbinfo->max_inodes) { 4330 err = "Cannot retroactively limit inodes"; 4331 goto out; 4332 } 4333 if (ctx->inodes * BOGO_INODE_SIZE < used_isp) { 4334 err = "Too few inodes for current use"; 4335 goto out; 4336 } 4337 } 4338 4339 if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums && 4340 sbinfo->next_ino > UINT_MAX) { 4341 err = "Current inum too high to switch to 32-bit inums"; 4342 goto out; 4343 } 4344 if ((ctx->seen & SHMEM_SEEN_NOSWAP) && ctx->noswap && !sbinfo->noswap) { 4345 err = "Cannot disable swap on remount"; 4346 goto out; 4347 } 4348 if (!(ctx->seen & SHMEM_SEEN_NOSWAP) && !ctx->noswap && sbinfo->noswap) { 4349 err = "Cannot enable swap on remount if it was disabled on first mount"; 4350 goto out; 4351 } 4352 4353 if (ctx->seen & SHMEM_SEEN_QUOTA && 4354 !sb_any_quota_loaded(fc->root->d_sb)) { 4355 err = "Cannot enable quota on remount"; 4356 goto out; 4357 } 4358 4359 #ifdef CONFIG_TMPFS_QUOTA 4360 #define CHANGED_LIMIT(name) \ 4361 (ctx->qlimits.name## hardlimit && \ 4362 (ctx->qlimits.name## hardlimit != sbinfo->qlimits.name## hardlimit)) 4363 4364 if (CHANGED_LIMIT(usrquota_b) || CHANGED_LIMIT(usrquota_i) || 4365 CHANGED_LIMIT(grpquota_b) || CHANGED_LIMIT(grpquota_i)) { 4366 err = "Cannot change global quota limit on remount"; 4367 goto out; 4368 } 4369 #endif /* CONFIG_TMPFS_QUOTA */ 4370 4371 if (ctx->seen & SHMEM_SEEN_HUGE) 4372 sbinfo->huge = ctx->huge; 4373 if (ctx->seen & SHMEM_SEEN_INUMS) 4374 sbinfo->full_inums = ctx->full_inums; 4375 if (ctx->seen & SHMEM_SEEN_BLOCKS) 4376 sbinfo->max_blocks = ctx->blocks; 4377 if (ctx->seen & SHMEM_SEEN_INODES) { 4378 sbinfo->max_inodes = ctx->inodes; 4379 sbinfo->free_ispace = ctx->inodes * BOGO_INODE_SIZE - used_isp; 4380 } 4381 4382 /* 4383 * Preserve previous mempolicy unless mpol remount option was specified. 
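         *
         * (Editorial note: mpol_parse_str() in shmem_parse_one() took the
         * initial reference on ctx->mpol; assigning it to sbinfo->mpol here
         * transfers that reference, and the old policy is only dropped via
         * mpol_put() after stat_lock has been released below.)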
4384 */ 4385 if (ctx->mpol) { 4386 mpol = sbinfo->mpol; 4387 sbinfo->mpol = ctx->mpol; /* transfers initial ref */ 4388 ctx->mpol = NULL; 4389 } 4390 4391 if (ctx->noswap) 4392 sbinfo->noswap = true; 4393 4394 raw_spin_unlock(&sbinfo->stat_lock); 4395 mpol_put(mpol); 4396 return 0; 4397 out: 4398 raw_spin_unlock(&sbinfo->stat_lock); 4399 return invalfc(fc, "%s", err); 4400 } 4401 4402 static int shmem_show_options(struct seq_file *seq, struct dentry *root) 4403 { 4404 struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb); 4405 struct mempolicy *mpol; 4406 4407 if (sbinfo->max_blocks != shmem_default_max_blocks()) 4408 seq_printf(seq, ",size=%luk", K(sbinfo->max_blocks)); 4409 if (sbinfo->max_inodes != shmem_default_max_inodes()) 4410 seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); 4411 if (sbinfo->mode != (0777 | S_ISVTX)) 4412 seq_printf(seq, ",mode=%03ho", sbinfo->mode); 4413 if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID)) 4414 seq_printf(seq, ",uid=%u", 4415 from_kuid_munged(&init_user_ns, sbinfo->uid)); 4416 if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) 4417 seq_printf(seq, ",gid=%u", 4418 from_kgid_munged(&init_user_ns, sbinfo->gid)); 4419 4420 /* 4421 * Showing inode{64,32} might be useful even if it's the system default, 4422 * since then people don't have to resort to checking both here and 4423 * /proc/config.gz to confirm 64-bit inums were successfully applied 4424 * (which may not even exist if IKCONFIG_PROC isn't enabled). 4425 * 4426 * We hide it when inode64 isn't the default and we are using 32-bit 4427 * inodes, since that probably just means the feature isn't even under 4428 * consideration. 4429 * 4430 * As such: 4431 * 4432 * +-----------------+-----------------+ 4433 * | TMPFS_INODE64=y | TMPFS_INODE64=n | 4434 * +------------------+-----------------+-----------------+ 4435 * | full_inums=true | show | show | 4436 * | full_inums=false | show | hide | 4437 * +------------------+-----------------+-----------------+ 4438 * 4439 */ 4440 if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums) 4441 seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 
64 : 32)); 4442 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4443 /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */ 4444 if (sbinfo->huge) 4445 seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge)); 4446 #endif 4447 mpol = shmem_get_sbmpol(sbinfo); 4448 shmem_show_mpol(seq, mpol); 4449 mpol_put(mpol); 4450 if (sbinfo->noswap) 4451 seq_printf(seq, ",noswap"); 4452 #ifdef CONFIG_TMPFS_QUOTA 4453 if (sb_has_quota_active(root->d_sb, USRQUOTA)) 4454 seq_printf(seq, ",usrquota"); 4455 if (sb_has_quota_active(root->d_sb, GRPQUOTA)) 4456 seq_printf(seq, ",grpquota"); 4457 if (sbinfo->qlimits.usrquota_bhardlimit) 4458 seq_printf(seq, ",usrquota_block_hardlimit=%lld", 4459 sbinfo->qlimits.usrquota_bhardlimit); 4460 if (sbinfo->qlimits.grpquota_bhardlimit) 4461 seq_printf(seq, ",grpquota_block_hardlimit=%lld", 4462 sbinfo->qlimits.grpquota_bhardlimit); 4463 if (sbinfo->qlimits.usrquota_ihardlimit) 4464 seq_printf(seq, ",usrquota_inode_hardlimit=%lld", 4465 sbinfo->qlimits.usrquota_ihardlimit); 4466 if (sbinfo->qlimits.grpquota_ihardlimit) 4467 seq_printf(seq, ",grpquota_inode_hardlimit=%lld", 4468 sbinfo->qlimits.grpquota_ihardlimit); 4469 #endif 4470 return 0; 4471 } 4472 4473 #endif /* CONFIG_TMPFS */ 4474 4475 static void shmem_put_super(struct super_block *sb) 4476 { 4477 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 4478 4479 #ifdef CONFIG_TMPFS_QUOTA 4480 shmem_disable_quotas(sb); 4481 #endif 4482 free_percpu(sbinfo->ino_batch); 4483 percpu_counter_destroy(&sbinfo->used_blocks); 4484 mpol_put(sbinfo->mpol); 4485 kfree(sbinfo); 4486 sb->s_fs_info = NULL; 4487 } 4488 4489 static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) 4490 { 4491 struct shmem_options *ctx = fc->fs_private; 4492 struct inode *inode; 4493 struct shmem_sb_info *sbinfo; 4494 int error = -ENOMEM; 4495 4496 /* Round up to L1_CACHE_BYTES to resist false sharing */ 4497 sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), 4498 L1_CACHE_BYTES), GFP_KERNEL); 4499 if (!sbinfo) 4500 return error; 4501 4502 sb->s_fs_info = sbinfo; 4503 4504 #ifdef CONFIG_TMPFS 4505 /* 4506 * Per default we only allow half of the physical ram per 4507 * tmpfs instance, limiting inodes to one per page of lowmem; 4508 * but the internal instance is left unlimited. 
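         *
         * (Editorial example: on a 64-bit machine with 8 GiB of RAM and
         * 4 KiB pages, an unconfigured user mount would default to
         * max_blocks = totalram_pages() / 2 = 1048576 blocks (4 GiB) and
         * roughly as many inodes; the exact figures depend on
         * totalram_pages() and totalhigh_pages() at mount time.)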
4509 */ 4510 if (!(sb->s_flags & SB_KERNMOUNT)) { 4511 if (!(ctx->seen & SHMEM_SEEN_BLOCKS)) 4512 ctx->blocks = shmem_default_max_blocks(); 4513 if (!(ctx->seen & SHMEM_SEEN_INODES)) 4514 ctx->inodes = shmem_default_max_inodes(); 4515 if (!(ctx->seen & SHMEM_SEEN_INUMS)) 4516 ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64); 4517 sbinfo->noswap = ctx->noswap; 4518 } else { 4519 sb->s_flags |= SB_NOUSER; 4520 } 4521 sb->s_export_op = &shmem_export_ops; 4522 sb->s_flags |= SB_NOSEC | SB_I_VERSION; 4523 #else 4524 sb->s_flags |= SB_NOUSER; 4525 #endif 4526 sbinfo->max_blocks = ctx->blocks; 4527 sbinfo->max_inodes = ctx->inodes; 4528 sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE; 4529 if (sb->s_flags & SB_KERNMOUNT) { 4530 sbinfo->ino_batch = alloc_percpu(ino_t); 4531 if (!sbinfo->ino_batch) 4532 goto failed; 4533 } 4534 sbinfo->uid = ctx->uid; 4535 sbinfo->gid = ctx->gid; 4536 sbinfo->full_inums = ctx->full_inums; 4537 sbinfo->mode = ctx->mode; 4538 sbinfo->huge = ctx->huge; 4539 sbinfo->mpol = ctx->mpol; 4540 ctx->mpol = NULL; 4541 4542 raw_spin_lock_init(&sbinfo->stat_lock); 4543 if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) 4544 goto failed; 4545 spin_lock_init(&sbinfo->shrinklist_lock); 4546 INIT_LIST_HEAD(&sbinfo->shrinklist); 4547 4548 sb->s_maxbytes = MAX_LFS_FILESIZE; 4549 sb->s_blocksize = PAGE_SIZE; 4550 sb->s_blocksize_bits = PAGE_SHIFT; 4551 sb->s_magic = TMPFS_MAGIC; 4552 sb->s_op = &shmem_ops; 4553 sb->s_time_gran = 1; 4554 #ifdef CONFIG_TMPFS_XATTR 4555 sb->s_xattr = shmem_xattr_handlers; 4556 #endif 4557 #ifdef CONFIG_TMPFS_POSIX_ACL 4558 sb->s_flags |= SB_POSIXACL; 4559 #endif 4560 uuid_t uuid; 4561 uuid_gen(&uuid); 4562 super_set_uuid(sb, uuid.b, sizeof(uuid)); 4563 4564 #ifdef CONFIG_TMPFS_QUOTA 4565 if (ctx->seen & SHMEM_SEEN_QUOTA) { 4566 sb->dq_op = &shmem_quota_operations; 4567 sb->s_qcop = &dquot_quotactl_sysfile_ops; 4568 sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; 4569 4570 /* Copy the default limits from ctx into sbinfo */ 4571 memcpy(&sbinfo->qlimits, &ctx->qlimits, 4572 sizeof(struct shmem_quota_limits)); 4573 4574 if (shmem_enable_quotas(sb, ctx->quota_types)) 4575 goto failed; 4576 } 4577 #endif /* CONFIG_TMPFS_QUOTA */ 4578 4579 inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, 4580 S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); 4581 if (IS_ERR(inode)) { 4582 error = PTR_ERR(inode); 4583 goto failed; 4584 } 4585 inode->i_uid = sbinfo->uid; 4586 inode->i_gid = sbinfo->gid; 4587 sb->s_root = d_make_root(inode); 4588 if (!sb->s_root) 4589 goto failed; 4590 return 0; 4591 4592 failed: 4593 shmem_put_super(sb); 4594 return error; 4595 } 4596 4597 static int shmem_get_tree(struct fs_context *fc) 4598 { 4599 return get_tree_nodev(fc, shmem_fill_super); 4600 } 4601 4602 static void shmem_free_fc(struct fs_context *fc) 4603 { 4604 struct shmem_options *ctx = fc->fs_private; 4605 4606 if (ctx) { 4607 mpol_put(ctx->mpol); 4608 kfree(ctx); 4609 } 4610 } 4611 4612 static const struct fs_context_operations shmem_fs_context_ops = { 4613 .free = shmem_free_fc, 4614 .get_tree = shmem_get_tree, 4615 #ifdef CONFIG_TMPFS 4616 .parse_monolithic = shmem_parse_options, 4617 .parse_param = shmem_parse_one, 4618 .reconfigure = shmem_reconfigure, 4619 #endif 4620 }; 4621 4622 static struct kmem_cache *shmem_inode_cachep __ro_after_init; 4623 4624 static struct inode *shmem_alloc_inode(struct super_block *sb) 4625 { 4626 struct shmem_inode_info *info; 4627 info = alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL); 4628 if (!info) 4629 return NULL; 4630 
return &info->vfs_inode; 4631 } 4632 4633 static void shmem_free_in_core_inode(struct inode *inode) 4634 { 4635 if (S_ISLNK(inode->i_mode)) 4636 kfree(inode->i_link); 4637 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 4638 } 4639 4640 static void shmem_destroy_inode(struct inode *inode) 4641 { 4642 if (S_ISREG(inode->i_mode)) 4643 mpol_free_shared_policy(&SHMEM_I(inode)->policy); 4644 if (S_ISDIR(inode->i_mode)) 4645 simple_offset_destroy(shmem_get_offset_ctx(inode)); 4646 } 4647 4648 static void shmem_init_inode(void *foo) 4649 { 4650 struct shmem_inode_info *info = foo; 4651 inode_init_once(&info->vfs_inode); 4652 } 4653 4654 static void __init shmem_init_inodecache(void) 4655 { 4656 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", 4657 sizeof(struct shmem_inode_info), 4658 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode); 4659 } 4660 4661 static void __init shmem_destroy_inodecache(void) 4662 { 4663 kmem_cache_destroy(shmem_inode_cachep); 4664 } 4665 4666 /* Keep the page in page cache instead of truncating it */ 4667 static int shmem_error_remove_folio(struct address_space *mapping, 4668 struct folio *folio) 4669 { 4670 return 0; 4671 } 4672 4673 static const struct address_space_operations shmem_aops = { 4674 .writepage = shmem_writepage, 4675 .dirty_folio = noop_dirty_folio, 4676 #ifdef CONFIG_TMPFS 4677 .write_begin = shmem_write_begin, 4678 .write_end = shmem_write_end, 4679 #endif 4680 #ifdef CONFIG_MIGRATION 4681 .migrate_folio = migrate_folio, 4682 #endif 4683 .error_remove_folio = shmem_error_remove_folio, 4684 }; 4685 4686 static const struct file_operations shmem_file_operations = { 4687 .mmap = shmem_mmap, 4688 .open = shmem_file_open, 4689 .get_unmapped_area = shmem_get_unmapped_area, 4690 #ifdef CONFIG_TMPFS 4691 .llseek = shmem_file_llseek, 4692 .read_iter = shmem_file_read_iter, 4693 .write_iter = shmem_file_write_iter, 4694 .fsync = noop_fsync, 4695 .splice_read = shmem_file_splice_read, 4696 .splice_write = iter_file_splice_write, 4697 .fallocate = shmem_fallocate, 4698 #endif 4699 }; 4700 4701 static const struct inode_operations shmem_inode_operations = { 4702 .getattr = shmem_getattr, 4703 .setattr = shmem_setattr, 4704 #ifdef CONFIG_TMPFS_XATTR 4705 .listxattr = shmem_listxattr, 4706 .set_acl = simple_set_acl, 4707 .fileattr_get = shmem_fileattr_get, 4708 .fileattr_set = shmem_fileattr_set, 4709 #endif 4710 }; 4711 4712 static const struct inode_operations shmem_dir_inode_operations = { 4713 #ifdef CONFIG_TMPFS 4714 .getattr = shmem_getattr, 4715 .create = shmem_create, 4716 .lookup = simple_lookup, 4717 .link = shmem_link, 4718 .unlink = shmem_unlink, 4719 .symlink = shmem_symlink, 4720 .mkdir = shmem_mkdir, 4721 .rmdir = shmem_rmdir, 4722 .mknod = shmem_mknod, 4723 .rename = shmem_rename2, 4724 .tmpfile = shmem_tmpfile, 4725 .get_offset_ctx = shmem_get_offset_ctx, 4726 #endif 4727 #ifdef CONFIG_TMPFS_XATTR 4728 .listxattr = shmem_listxattr, 4729 .fileattr_get = shmem_fileattr_get, 4730 .fileattr_set = shmem_fileattr_set, 4731 #endif 4732 #ifdef CONFIG_TMPFS_POSIX_ACL 4733 .setattr = shmem_setattr, 4734 .set_acl = simple_set_acl, 4735 #endif 4736 }; 4737 4738 static const struct inode_operations shmem_special_inode_operations = { 4739 .getattr = shmem_getattr, 4740 #ifdef CONFIG_TMPFS_XATTR 4741 .listxattr = shmem_listxattr, 4742 #endif 4743 #ifdef CONFIG_TMPFS_POSIX_ACL 4744 .setattr = shmem_setattr, 4745 .set_acl = simple_set_acl, 4746 #endif 4747 }; 4748 4749 static const struct super_operations shmem_ops = { 4750 .alloc_inode = 
shmem_alloc_inode, 4751 .free_inode = shmem_free_in_core_inode, 4752 .destroy_inode = shmem_destroy_inode, 4753 #ifdef CONFIG_TMPFS 4754 .statfs = shmem_statfs, 4755 .show_options = shmem_show_options, 4756 #endif 4757 #ifdef CONFIG_TMPFS_QUOTA 4758 .get_dquots = shmem_get_dquots, 4759 #endif 4760 .evict_inode = shmem_evict_inode, 4761 .drop_inode = generic_delete_inode, 4762 .put_super = shmem_put_super, 4763 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4764 .nr_cached_objects = shmem_unused_huge_count, 4765 .free_cached_objects = shmem_unused_huge_scan, 4766 #endif 4767 }; 4768 4769 static const struct vm_operations_struct shmem_vm_ops = { 4770 .fault = shmem_fault, 4771 .map_pages = filemap_map_pages, 4772 #ifdef CONFIG_NUMA 4773 .set_policy = shmem_set_policy, 4774 .get_policy = shmem_get_policy, 4775 #endif 4776 }; 4777 4778 static const struct vm_operations_struct shmem_anon_vm_ops = { 4779 .fault = shmem_fault, 4780 .map_pages = filemap_map_pages, 4781 #ifdef CONFIG_NUMA 4782 .set_policy = shmem_set_policy, 4783 .get_policy = shmem_get_policy, 4784 #endif 4785 }; 4786 4787 int shmem_init_fs_context(struct fs_context *fc) 4788 { 4789 struct shmem_options *ctx; 4790 4791 ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL); 4792 if (!ctx) 4793 return -ENOMEM; 4794 4795 ctx->mode = 0777 | S_ISVTX; 4796 ctx->uid = current_fsuid(); 4797 ctx->gid = current_fsgid(); 4798 4799 fc->fs_private = ctx; 4800 fc->ops = &shmem_fs_context_ops; 4801 return 0; 4802 } 4803 4804 static struct file_system_type shmem_fs_type = { 4805 .owner = THIS_MODULE, 4806 .name = "tmpfs", 4807 .init_fs_context = shmem_init_fs_context, 4808 #ifdef CONFIG_TMPFS 4809 .parameters = shmem_fs_parameters, 4810 #endif 4811 .kill_sb = kill_litter_super, 4812 .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP, 4813 }; 4814 4815 void __init shmem_init(void) 4816 { 4817 int error; 4818 4819 shmem_init_inodecache(); 4820 4821 #ifdef CONFIG_TMPFS_QUOTA 4822 error = register_quota_format(&shmem_quota_format); 4823 if (error < 0) { 4824 pr_err("Could not register quota format\n"); 4825 goto out3; 4826 } 4827 #endif 4828 4829 error = register_filesystem(&shmem_fs_type); 4830 if (error) { 4831 pr_err("Could not register tmpfs\n"); 4832 goto out2; 4833 } 4834 4835 shm_mnt = kern_mount(&shmem_fs_type); 4836 if (IS_ERR(shm_mnt)) { 4837 error = PTR_ERR(shm_mnt); 4838 pr_err("Could not kern_mount tmpfs\n"); 4839 goto out1; 4840 } 4841 4842 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4843 if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY) 4844 SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; 4845 else 4846 shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */ 4847 4848 /* 4849 * Default to setting PMD-sized THP to inherit the global setting and 4850 * disable all other multi-size THPs. 
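         *
         * (Editorial example, assuming 4 KiB pages: HPAGE_PMD_ORDER is then 9,
         * so only bit 9 is set in huge_shmem_orders_inherit; the 2 MiB size
         * starts out as "inherit" in
         * /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/shmem_enabled,
         * while every other size starts out as "never".)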
4851 */ 4852 huge_shmem_orders_inherit = BIT(HPAGE_PMD_ORDER); 4853 #endif 4854 return; 4855 4856 out1: 4857 unregister_filesystem(&shmem_fs_type); 4858 out2: 4859 #ifdef CONFIG_TMPFS_QUOTA 4860 unregister_quota_format(&shmem_quota_format); 4861 out3: 4862 #endif 4863 shmem_destroy_inodecache(); 4864 shm_mnt = ERR_PTR(error); 4865 } 4866 4867 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS) 4868 static ssize_t shmem_enabled_show(struct kobject *kobj, 4869 struct kobj_attribute *attr, char *buf) 4870 { 4871 static const int values[] = { 4872 SHMEM_HUGE_ALWAYS, 4873 SHMEM_HUGE_WITHIN_SIZE, 4874 SHMEM_HUGE_ADVISE, 4875 SHMEM_HUGE_NEVER, 4876 SHMEM_HUGE_DENY, 4877 SHMEM_HUGE_FORCE, 4878 }; 4879 int len = 0; 4880 int i; 4881 4882 for (i = 0; i < ARRAY_SIZE(values); i++) { 4883 len += sysfs_emit_at(buf, len, 4884 shmem_huge == values[i] ? "%s[%s]" : "%s%s", 4885 i ? " " : "", shmem_format_huge(values[i])); 4886 } 4887 len += sysfs_emit_at(buf, len, "\n"); 4888 4889 return len; 4890 } 4891 4892 static ssize_t shmem_enabled_store(struct kobject *kobj, 4893 struct kobj_attribute *attr, const char *buf, size_t count) 4894 { 4895 char tmp[16]; 4896 int huge; 4897 4898 if (count + 1 > sizeof(tmp)) 4899 return -EINVAL; 4900 memcpy(tmp, buf, count); 4901 tmp[count] = '\0'; 4902 if (count && tmp[count - 1] == '\n') 4903 tmp[count - 1] = '\0'; 4904 4905 huge = shmem_parse_huge(tmp); 4906 if (huge == -EINVAL) 4907 return -EINVAL; 4908 if (!has_transparent_hugepage() && 4909 huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY) 4910 return -EINVAL; 4911 4912 /* Do not override huge allocation policy with non-PMD sized mTHP */ 4913 if (huge == SHMEM_HUGE_FORCE && 4914 huge_shmem_orders_inherit != BIT(HPAGE_PMD_ORDER)) 4915 return -EINVAL; 4916 4917 shmem_huge = huge; 4918 if (shmem_huge > SHMEM_HUGE_DENY) 4919 SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; 4920 return count; 4921 } 4922 4923 struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled); 4924 static DEFINE_SPINLOCK(huge_shmem_orders_lock); 4925 4926 static ssize_t thpsize_shmem_enabled_show(struct kobject *kobj, 4927 struct kobj_attribute *attr, char *buf) 4928 { 4929 int order = to_thpsize(kobj)->order; 4930 const char *output; 4931 4932 if (test_bit(order, &huge_shmem_orders_always)) 4933 output = "[always] inherit within_size advise never"; 4934 else if (test_bit(order, &huge_shmem_orders_inherit)) 4935 output = "always [inherit] within_size advise never"; 4936 else if (test_bit(order, &huge_shmem_orders_within_size)) 4937 output = "always inherit [within_size] advise never"; 4938 else if (test_bit(order, &huge_shmem_orders_madvise)) 4939 output = "always inherit within_size [advise] never"; 4940 else 4941 output = "always inherit within_size advise [never]"; 4942 4943 return sysfs_emit(buf, "%s\n", output); 4944 } 4945 4946 static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj, 4947 struct kobj_attribute *attr, 4948 const char *buf, size_t count) 4949 { 4950 int order = to_thpsize(kobj)->order; 4951 ssize_t ret = count; 4952 4953 if (sysfs_streq(buf, "always")) { 4954 spin_lock(&huge_shmem_orders_lock); 4955 clear_bit(order, &huge_shmem_orders_inherit); 4956 clear_bit(order, &huge_shmem_orders_madvise); 4957 clear_bit(order, &huge_shmem_orders_within_size); 4958 set_bit(order, &huge_shmem_orders_always); 4959 spin_unlock(&huge_shmem_orders_lock); 4960 } else if (sysfs_streq(buf, "inherit")) { 4961 /* Do not override huge allocation policy with non-PMD sized mTHP */ 4962 if (shmem_huge == SHMEM_HUGE_FORCE && 
4963                     order != HPAGE_PMD_ORDER)
4964                         return -EINVAL;
4965 
4966                 spin_lock(&huge_shmem_orders_lock);
4967                 clear_bit(order, &huge_shmem_orders_always);
4968                 clear_bit(order, &huge_shmem_orders_madvise);
4969                 clear_bit(order, &huge_shmem_orders_within_size);
4970                 set_bit(order, &huge_shmem_orders_inherit);
4971                 spin_unlock(&huge_shmem_orders_lock);
4972         } else if (sysfs_streq(buf, "within_size")) {
4973                 spin_lock(&huge_shmem_orders_lock);
4974                 clear_bit(order, &huge_shmem_orders_always);
4975                 clear_bit(order, &huge_shmem_orders_inherit);
4976                 clear_bit(order, &huge_shmem_orders_madvise);
4977                 set_bit(order, &huge_shmem_orders_within_size);
4978                 spin_unlock(&huge_shmem_orders_lock);
4979         } else if (sysfs_streq(buf, "advise")) {
4980                 spin_lock(&huge_shmem_orders_lock);
4981                 clear_bit(order, &huge_shmem_orders_always);
4982                 clear_bit(order, &huge_shmem_orders_inherit);
4983                 clear_bit(order, &huge_shmem_orders_within_size);
4984                 set_bit(order, &huge_shmem_orders_madvise);
4985                 spin_unlock(&huge_shmem_orders_lock);
4986         } else if (sysfs_streq(buf, "never")) {
4987                 spin_lock(&huge_shmem_orders_lock);
4988                 clear_bit(order, &huge_shmem_orders_always);
4989                 clear_bit(order, &huge_shmem_orders_inherit);
4990                 clear_bit(order, &huge_shmem_orders_within_size);
4991                 clear_bit(order, &huge_shmem_orders_madvise);
4992                 spin_unlock(&huge_shmem_orders_lock);
4993         } else {
4994                 ret = -EINVAL;
4995         }
4996 
4997         return ret;
4998 }
4999 
5000 struct kobj_attribute thpsize_shmem_enabled_attr =
5001         __ATTR(shmem_enabled, 0644, thpsize_shmem_enabled_show, thpsize_shmem_enabled_store);
5002 #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
5003 
5004 #else /* !CONFIG_SHMEM */
5005 
5006 /*
5007  * tiny-shmem: simple shmemfs and tmpfs using ramfs code
5008  *
5009  * This is intended for small systems where the benefits of the full
5010  * shmem code (swap-backed and resource-limited) are outweighed by
5011  * its complexity. On systems without swap this code should be
5012  * effectively equivalent, but much lighter weight.
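 *
 * (Editorial note: in this !CONFIG_SHMEM configuration the "tmpfs" type
 * registered below is plain ramfs, so files are never swap-backed, and the
 * shmem entry points that follow are reduced to stubs or ramfs wrappers.)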
5013  */
5014 
5015 static struct file_system_type shmem_fs_type = {
5016         .name = "tmpfs",
5017         .init_fs_context = ramfs_init_fs_context,
5018         .parameters = ramfs_fs_parameters,
5019         .kill_sb = ramfs_kill_sb,
5020         .fs_flags = FS_USERNS_MOUNT,
5021 };
5022 
5023 void __init shmem_init(void)
5024 {
5025         BUG_ON(register_filesystem(&shmem_fs_type) != 0);
5026 
5027         shm_mnt = kern_mount(&shmem_fs_type);
5028         BUG_ON(IS_ERR(shm_mnt));
5029 }
5030 
5031 int shmem_unuse(unsigned int type)
5032 {
5033         return 0;
5034 }
5035 
5036 int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
5037 {
5038         return 0;
5039 }
5040 
5041 void shmem_unlock_mapping(struct address_space *mapping)
5042 {
5043 }
5044 
5045 #ifdef CONFIG_MMU
5046 unsigned long shmem_get_unmapped_area(struct file *file,
5047                                       unsigned long addr, unsigned long len,
5048                                       unsigned long pgoff, unsigned long flags)
5049 {
5050         return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
5051 }
5052 #endif
5053 
5054 void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
5055 {
5056         truncate_inode_pages_range(inode->i_mapping, lstart, lend);
5057 }
5058 EXPORT_SYMBOL_GPL(shmem_truncate_range);
5059 
5060 #define shmem_vm_ops generic_file_vm_ops
5061 #define shmem_anon_vm_ops generic_file_vm_ops
5062 #define shmem_file_operations ramfs_file_operations
5063 #define shmem_acct_size(flags, size) 0
5064 #define shmem_unacct_size(flags, size) do {} while (0)
5065 
5066 static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
5067                                 struct super_block *sb, struct inode *dir,
5068                                 umode_t mode, dev_t dev, unsigned long flags)
5069 {
5070         struct inode *inode = ramfs_get_inode(sb, dir, mode, dev);
5071         return inode ? inode : ERR_PTR(-ENOSPC);
5072 }
5073 
5074 #endif /* CONFIG_SHMEM */
5075 
5076 /* common code */
5077 
5078 static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
5079                         loff_t size, unsigned long flags, unsigned int i_flags)
5080 {
5081         struct inode *inode;
5082         struct file *res;
5083 
5084         if (IS_ERR(mnt))
5085                 return ERR_CAST(mnt);
5086 
5087         if (size < 0 || size > MAX_LFS_FILESIZE)
5088                 return ERR_PTR(-EINVAL);
5089 
5090         if (shmem_acct_size(flags, size))
5091                 return ERR_PTR(-ENOMEM);
5092 
5093         if (is_idmapped_mnt(mnt))
5094                 return ERR_PTR(-EINVAL);
5095 
5096         inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
5097                                 S_IFREG | S_IRWXUGO, 0, flags);
5098         if (IS_ERR(inode)) {
5099                 shmem_unacct_size(flags, size);
5100                 return ERR_CAST(inode);
5101         }
5102         inode->i_flags |= i_flags;
5103         inode->i_size = size;
5104         clear_nlink(inode);     /* It is unlinked */
5105         res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
5106         if (!IS_ERR(res))
5107                 res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
5108                                 &shmem_file_operations);
5109         if (IS_ERR(res))
5110                 iput(inode);
5111         return res;
5112 }
5113 
5114 /**
5115  * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
5116  * kernel internal. There will be NO LSM permission checks against the
5117  * underlying inode. So users of this interface must do LSM checks at a
5118  * higher layer. The users are the big_key and shm implementations. LSM
5119  * checks are provided at the key or shm level rather than the inode.
5120  * @name: name for dentry (to be seen in /proc/<pid>/maps)
5121  * @size: size to be set for the file
5122  * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
5123  */
5124 struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
5125 {
5126         return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
5127 }
5128 EXPORT_SYMBOL_GPL(shmem_kernel_file_setup);
5129 
5130 /**
5131  * shmem_file_setup - get an unlinked file living in tmpfs
5132  * @name: name for dentry (to be seen in /proc/<pid>/maps)
5133  * @size: size to be set for the file
5134  * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
5135  */
5136 struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
5137 {
5138         return __shmem_file_setup(shm_mnt, name, size, flags, 0);
5139 }
5140 EXPORT_SYMBOL_GPL(shmem_file_setup);
5141 
5142 /**
5143  * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
5144  * @mnt: the tmpfs mount where the file will be created
5145  * @name: name for dentry (to be seen in /proc/<pid>/maps)
5146  * @size: size to be set for the file
5147  * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
5148  */
5149 struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
5150                                        loff_t size, unsigned long flags)
5151 {
5152         return __shmem_file_setup(mnt, name, size, flags, 0);
5153 }
5154 EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
5155 
5156 /**
5157  * shmem_zero_setup - setup a shared anonymous mapping
5158  * @vma: the vma to be mmapped, as prepared by do_mmap
5159  */
5160 int shmem_zero_setup(struct vm_area_struct *vma)
5161 {
5162         struct file *file;
5163         loff_t size = vma->vm_end - vma->vm_start;
5164 
5165         /*
5166          * Cloning a new file under mmap_lock leads to a lock ordering conflict
5167          * between XFS directory reading and selinux: since this file is only
5168          * accessible to the user through its mapping, use S_PRIVATE flag to
5169          * bypass file security, in the same way as shmem_kernel_file_setup().
5170          */
5171         file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags);
5172         if (IS_ERR(file))
5173                 return PTR_ERR(file);
5174 
5175         if (vma->vm_file)
5176                 fput(vma->vm_file);
5177         vma->vm_file = file;
5178         vma->vm_ops = &shmem_anon_vm_ops;
5179 
5180         return 0;
5181 }
5182 
5183 /**
5184  * shmem_read_folio_gfp - read into page cache, using specified page allocation flags.
5185  * @mapping: the folio's address_space
5186  * @index: the folio index
5187  * @gfp: the page allocator flags to use if allocating
5188  *
5189  * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
5190  * with any new page allocations done using the specified allocation flags.
5191  * But read_cache_page_gfp() uses the ->read_folio() method, which does not
5192  * suit tmpfs, since it may have pages in swapcache, and needs to find those
5193  * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
5194  *
5195  * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
5196  * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
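 *
 * (Editorial sketch, not from the original source: a caller in that style
 * might do
 *
 *      folio = shmem_read_folio_gfp(mapping, index,
 *                                   mapping_gfp_mask(mapping) |
 *                                   __GFP_NORETRY | __GFP_NOWARN);
 *      if (IS_ERR(folio))
 *              return PTR_ERR(folio);
 *
 * and the folio comes back unlocked, with a reference to drop via
 * folio_put() when the caller is done.)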
5197  */
5198 struct folio *shmem_read_folio_gfp(struct address_space *mapping,
5199                 pgoff_t index, gfp_t gfp)
5200 {
5201 #ifdef CONFIG_SHMEM
5202         struct inode *inode = mapping->host;
5203         struct folio *folio;
5204         int error;
5205 
5206         error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE,
5207                                     gfp, NULL, NULL);
5208         if (error)
5209                 return ERR_PTR(error);
5210 
5211         folio_unlock(folio);
5212         return folio;
5213 #else
5214         /*
5215          * The tiny !SHMEM case uses ramfs without swap
5216          */
5217         return mapping_read_folio_gfp(mapping, index, gfp);
5218 #endif
5219 }
5220 EXPORT_SYMBOL_GPL(shmem_read_folio_gfp);
5221 
5222 struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
5223                                          pgoff_t index, gfp_t gfp)
5224 {
5225         struct folio *folio = shmem_read_folio_gfp(mapping, index, gfp);
5226         struct page *page;
5227 
5228         if (IS_ERR(folio))
5229                 return &folio->page;
5230 
5231         page = folio_file_page(folio, index);
5232         if (PageHWPoison(page)) {
5233                 folio_put(folio);
5234                 return ERR_PTR(-EIO);
5235         }
5236 
5237         return page;
5238 }
5239 EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
5240 
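/*
 * Editorial sketch (not part of the original file): a minimal kernel-internal
 * user of the helpers above.  The function name and the use of SZ_1M and
 * VM_NORESERVE are illustrative assumptions, not taken from this file.
 *
 *      static int example_tmpfs_scratch(void)
 *      {
 *              struct file *file;
 *              struct page *page;
 *
 *              file = shmem_file_setup("scratch", SZ_1M, VM_NORESERVE);
 *              if (IS_ERR(file))
 *                      return PTR_ERR(file);
 *
 *              page = shmem_read_mapping_page_gfp(file->f_mapping, 0,
 *                                      mapping_gfp_mask(file->f_mapping));
 *              if (IS_ERR(page)) {
 *                      fput(file);
 *                      return PTR_ERR(page);
 *              }
 *              put_page(page);
 *              fput(file);
 *              return 0;
 *      }
 */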