1 /* 2 * Resizable virtual memory filesystem for Linux. 3 * 4 * Copyright (C) 2000 Linus Torvalds. 5 * 2000 Transmeta Corp. 6 * 2000-2001 Christoph Rohland 7 * 2000-2001 SAP AG 8 * 2002 Red Hat Inc. 9 * Copyright (C) 2002-2011 Hugh Dickins. 10 * Copyright (C) 2011 Google Inc. 11 * Copyright (C) 2002-2005 VERITAS Software Corporation. 12 * Copyright (C) 2004 Andi Kleen, SuSE Labs 13 * 14 * Extended attribute support for tmpfs: 15 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net> 16 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> 17 * 18 * tiny-shmem: 19 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com> 20 * 21 * This file is released under the GPL. 22 */ 23 24 #include <linux/fs.h> 25 #include <linux/init.h> 26 #include <linux/vfs.h> 27 #include <linux/mount.h> 28 #include <linux/ramfs.h> 29 #include <linux/pagemap.h> 30 #include <linux/file.h> 31 #include <linux/fileattr.h> 32 #include <linux/mm.h> 33 #include <linux/random.h> 34 #include <linux/sched/signal.h> 35 #include <linux/export.h> 36 #include <linux/shmem_fs.h> 37 #include <linux/swap.h> 38 #include <linux/uio.h> 39 #include <linux/hugetlb.h> 40 #include <linux/fs_parser.h> 41 #include <linux/swapfile.h> 42 #include <linux/iversion.h> 43 #include "swap.h" 44 45 static struct vfsmount *shm_mnt __ro_after_init; 46 47 #ifdef CONFIG_SHMEM 48 /* 49 * This virtual memory filesystem is heavily based on the ramfs. It 50 * extends ramfs by the ability to use swap and honor resource limits 51 * which makes it a completely usable filesystem. 52 */ 53 54 #include <linux/xattr.h> 55 #include <linux/exportfs.h> 56 #include <linux/posix_acl.h> 57 #include <linux/posix_acl_xattr.h> 58 #include <linux/mman.h> 59 #include <linux/string.h> 60 #include <linux/slab.h> 61 #include <linux/backing-dev.h> 62 #include <linux/writeback.h> 63 #include <linux/pagevec.h> 64 #include <linux/percpu_counter.h> 65 #include <linux/falloc.h> 66 #include <linux/splice.h> 67 #include <linux/security.h> 68 #include <linux/swapops.h> 69 #include <linux/mempolicy.h> 70 #include <linux/namei.h> 71 #include <linux/ctype.h> 72 #include <linux/migrate.h> 73 #include <linux/highmem.h> 74 #include <linux/seq_file.h> 75 #include <linux/magic.h> 76 #include <linux/syscalls.h> 77 #include <linux/fcntl.h> 78 #include <uapi/linux/memfd.h> 79 #include <linux/rmap.h> 80 #include <linux/uuid.h> 81 #include <linux/quotaops.h> 82 83 #include <linux/uaccess.h> 84 85 #include "internal.h" 86 87 #define BLOCKS_PER_PAGE (PAGE_SIZE/512) 88 #define VM_ACCT(size) (PAGE_ALIGN(size) >> PAGE_SHIFT) 89 90 /* Pretend that each entry is of this size in directory's i_size */ 91 #define BOGO_DIRENT_SIZE 20 92 93 /* Pretend that one inode + its dentry occupy this much memory */ 94 #define BOGO_INODE_SIZE 1024 95 96 /* Symlink up to this size is kmalloc'ed instead of using a swappable page */ 97 #define SHORT_SYMLINK_LEN 128 98 99 /* 100 * shmem_fallocate communicates with shmem_fault or shmem_writepage via 101 * inode->i_private (with i_rwsem making sure that it has only one user at 102 * a time): we would prefer not to enlarge the shmem inode just for that. 
103 */ 104 struct shmem_falloc { 105 wait_queue_head_t *waitq; /* faults into hole wait for punch to end */ 106 pgoff_t start; /* start of range currently being fallocated */ 107 pgoff_t next; /* the next page offset to be fallocated */ 108 pgoff_t nr_falloced; /* how many new pages have been fallocated */ 109 pgoff_t nr_unswapped; /* how often writepage refused to swap out */ 110 }; 111 112 struct shmem_options { 113 unsigned long long blocks; 114 unsigned long long inodes; 115 struct mempolicy *mpol; 116 kuid_t uid; 117 kgid_t gid; 118 umode_t mode; 119 bool full_inums; 120 int huge; 121 int seen; 122 bool noswap; 123 unsigned short quota_types; 124 struct shmem_quota_limits qlimits; 125 #define SHMEM_SEEN_BLOCKS 1 126 #define SHMEM_SEEN_INODES 2 127 #define SHMEM_SEEN_HUGE 4 128 #define SHMEM_SEEN_INUMS 8 129 #define SHMEM_SEEN_NOSWAP 16 130 #define SHMEM_SEEN_QUOTA 32 131 }; 132 133 #ifdef CONFIG_TMPFS 134 static unsigned long shmem_default_max_blocks(void) 135 { 136 return totalram_pages() / 2; 137 } 138 139 static unsigned long shmem_default_max_inodes(void) 140 { 141 unsigned long nr_pages = totalram_pages(); 142 143 return min3(nr_pages - totalhigh_pages(), nr_pages / 2, 144 ULONG_MAX / BOGO_INODE_SIZE); 145 } 146 #endif 147 148 static int shmem_swapin_folio(struct inode *inode, pgoff_t index, 149 struct folio **foliop, enum sgp_type sgp, gfp_t gfp, 150 struct mm_struct *fault_mm, vm_fault_t *fault_type); 151 152 static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) 153 { 154 return sb->s_fs_info; 155 } 156 157 /* 158 * shmem_file_setup pre-accounts the whole fixed size of a VM object, 159 * for shared memory and for shared anonymous (/dev/zero) mappings 160 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1), 161 * consistent with the pre-accounting of private mappings ... 162 */ 163 static inline int shmem_acct_size(unsigned long flags, loff_t size) 164 { 165 return (flags & VM_NORESERVE) ? 166 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size)); 167 } 168 169 static inline void shmem_unacct_size(unsigned long flags, loff_t size) 170 { 171 if (!(flags & VM_NORESERVE)) 172 vm_unacct_memory(VM_ACCT(size)); 173 } 174 175 static inline int shmem_reacct_size(unsigned long flags, 176 loff_t oldsize, loff_t newsize) 177 { 178 if (!(flags & VM_NORESERVE)) { 179 if (VM_ACCT(newsize) > VM_ACCT(oldsize)) 180 return security_vm_enough_memory_mm(current->mm, 181 VM_ACCT(newsize) - VM_ACCT(oldsize)); 182 else if (VM_ACCT(newsize) < VM_ACCT(oldsize)) 183 vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize)); 184 } 185 return 0; 186 } 187 188 /* 189 * ... whereas tmpfs objects are accounted incrementally as 190 * pages are allocated, in order to allow large sparse files. 191 * shmem_get_folio reports shmem_acct_blocks failure as -ENOSPC not -ENOMEM, 192 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. 
193 */ 194 static inline int shmem_acct_blocks(unsigned long flags, long pages) 195 { 196 if (!(flags & VM_NORESERVE)) 197 return 0; 198 199 return security_vm_enough_memory_mm(current->mm, 200 pages * VM_ACCT(PAGE_SIZE)); 201 } 202 203 static inline void shmem_unacct_blocks(unsigned long flags, long pages) 204 { 205 if (flags & VM_NORESERVE) 206 vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE)); 207 } 208 209 static int shmem_inode_acct_blocks(struct inode *inode, long pages) 210 { 211 struct shmem_inode_info *info = SHMEM_I(inode); 212 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 213 int err = -ENOSPC; 214 215 if (shmem_acct_blocks(info->flags, pages)) 216 return err; 217 218 might_sleep(); /* when quotas */ 219 if (sbinfo->max_blocks) { 220 if (!percpu_counter_limited_add(&sbinfo->used_blocks, 221 sbinfo->max_blocks, pages)) 222 goto unacct; 223 224 err = dquot_alloc_block_nodirty(inode, pages); 225 if (err) { 226 percpu_counter_sub(&sbinfo->used_blocks, pages); 227 goto unacct; 228 } 229 } else { 230 err = dquot_alloc_block_nodirty(inode, pages); 231 if (err) 232 goto unacct; 233 } 234 235 return 0; 236 237 unacct: 238 shmem_unacct_blocks(info->flags, pages); 239 return err; 240 } 241 242 static void shmem_inode_unacct_blocks(struct inode *inode, long pages) 243 { 244 struct shmem_inode_info *info = SHMEM_I(inode); 245 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 246 247 might_sleep(); /* when quotas */ 248 dquot_free_block_nodirty(inode, pages); 249 250 if (sbinfo->max_blocks) 251 percpu_counter_sub(&sbinfo->used_blocks, pages); 252 shmem_unacct_blocks(info->flags, pages); 253 } 254 255 static const struct super_operations shmem_ops; 256 const struct address_space_operations shmem_aops; 257 static const struct file_operations shmem_file_operations; 258 static const struct inode_operations shmem_inode_operations; 259 static const struct inode_operations shmem_dir_inode_operations; 260 static const struct inode_operations shmem_special_inode_operations; 261 static const struct vm_operations_struct shmem_vm_ops; 262 static const struct vm_operations_struct shmem_anon_vm_ops; 263 static struct file_system_type shmem_fs_type; 264 265 bool vma_is_anon_shmem(struct vm_area_struct *vma) 266 { 267 return vma->vm_ops == &shmem_anon_vm_ops; 268 } 269 270 bool vma_is_shmem(struct vm_area_struct *vma) 271 { 272 return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops; 273 } 274 275 static LIST_HEAD(shmem_swaplist); 276 static DEFINE_MUTEX(shmem_swaplist_mutex); 277 278 #ifdef CONFIG_TMPFS_QUOTA 279 280 static int shmem_enable_quotas(struct super_block *sb, 281 unsigned short quota_types) 282 { 283 int type, err = 0; 284 285 sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY; 286 for (type = 0; type < SHMEM_MAXQUOTAS; type++) { 287 if (!(quota_types & (1 << type))) 288 continue; 289 err = dquot_load_quota_sb(sb, type, QFMT_SHMEM, 290 DQUOT_USAGE_ENABLED | 291 DQUOT_LIMITS_ENABLED); 292 if (err) 293 goto out_err; 294 } 295 return 0; 296 297 out_err: 298 pr_warn("tmpfs: failed to enable quota tracking (type=%d, err=%d)\n", 299 type, err); 300 for (type--; type >= 0; type--) 301 dquot_quota_off(sb, type); 302 return err; 303 } 304 305 static void shmem_disable_quotas(struct super_block *sb) 306 { 307 int type; 308 309 for (type = 0; type < SHMEM_MAXQUOTAS; type++) 310 dquot_quota_off(sb, type); 311 } 312 313 static struct dquot **shmem_get_dquots(struct inode *inode) 314 { 315 return SHMEM_I(inode)->i_dquot; 316 } 317 #endif /* CONFIG_TMPFS_QUOTA */ 318 319 /* 320 * 
shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and 321 * produces a novel ino for the newly allocated inode. 322 * 323 * It may also be called when making a hard link to permit the space needed by 324 * each dentry. However, in that case, no new inode number is needed since that 325 * internally draws from another pool of inode numbers (currently global 326 * get_next_ino()). This case is indicated by passing NULL as inop. 327 */ 328 #define SHMEM_INO_BATCH 1024 329 static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) 330 { 331 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 332 ino_t ino; 333 334 if (!(sb->s_flags & SB_KERNMOUNT)) { 335 raw_spin_lock(&sbinfo->stat_lock); 336 if (sbinfo->max_inodes) { 337 if (sbinfo->free_ispace < BOGO_INODE_SIZE) { 338 raw_spin_unlock(&sbinfo->stat_lock); 339 return -ENOSPC; 340 } 341 sbinfo->free_ispace -= BOGO_INODE_SIZE; 342 } 343 if (inop) { 344 ino = sbinfo->next_ino++; 345 if (unlikely(is_zero_ino(ino))) 346 ino = sbinfo->next_ino++; 347 if (unlikely(!sbinfo->full_inums && 348 ino > UINT_MAX)) { 349 /* 350 * Emulate get_next_ino uint wraparound for 351 * compatibility 352 */ 353 if (IS_ENABLED(CONFIG_64BIT)) 354 pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n", 355 __func__, MINOR(sb->s_dev)); 356 sbinfo->next_ino = 1; 357 ino = sbinfo->next_ino++; 358 } 359 *inop = ino; 360 } 361 raw_spin_unlock(&sbinfo->stat_lock); 362 } else if (inop) { 363 /* 364 * __shmem_file_setup, one of our callers, is lock-free: it 365 * doesn't hold stat_lock in shmem_reserve_inode since 366 * max_inodes is always 0, and is called from potentially 367 * unknown contexts. As such, use a per-cpu batched allocator 368 * which doesn't require the per-sb stat_lock unless we are at 369 * the batch boundary. 370 * 371 * We don't need to worry about inode{32,64} since SB_KERNMOUNT 372 * shmem mounts are not exposed to userspace, so we don't need 373 * to worry about things like glibc compatibility. 374 */ 375 ino_t *next_ino; 376 377 next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu()); 378 ino = *next_ino; 379 if (unlikely(ino % SHMEM_INO_BATCH == 0)) { 380 raw_spin_lock(&sbinfo->stat_lock); 381 ino = sbinfo->next_ino; 382 sbinfo->next_ino += SHMEM_INO_BATCH; 383 raw_spin_unlock(&sbinfo->stat_lock); 384 if (unlikely(is_zero_ino(ino))) 385 ino++; 386 } 387 *inop = ino; 388 *next_ino = ++ino; 389 put_cpu(); 390 } 391 392 return 0; 393 } 394 395 static void shmem_free_inode(struct super_block *sb, size_t freed_ispace) 396 { 397 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 398 if (sbinfo->max_inodes) { 399 raw_spin_lock(&sbinfo->stat_lock); 400 sbinfo->free_ispace += BOGO_INODE_SIZE + freed_ispace; 401 raw_spin_unlock(&sbinfo->stat_lock); 402 } 403 } 404 405 /** 406 * shmem_recalc_inode - recalculate the block usage of an inode 407 * @inode: inode to recalc 408 * @alloced: the change in number of pages allocated to inode 409 * @swapped: the change in number of pages swapped from inode 410 * 411 * We have to calculate the free blocks since the mm can drop 412 * undirtied hole pages behind our back. 
413 * 414 * But normally info->alloced == inode->i_mapping->nrpages + info->swapped 415 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped) 416 */ 417 static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped) 418 { 419 struct shmem_inode_info *info = SHMEM_I(inode); 420 long freed; 421 422 spin_lock(&info->lock); 423 info->alloced += alloced; 424 info->swapped += swapped; 425 freed = info->alloced - info->swapped - 426 READ_ONCE(inode->i_mapping->nrpages); 427 /* 428 * Special case: whereas normally shmem_recalc_inode() is called 429 * after i_mapping->nrpages has already been adjusted (up or down), 430 * shmem_writepage() has to raise swapped before nrpages is lowered - 431 * to stop a racing shmem_recalc_inode() from thinking that a page has 432 * been freed. Compensate here, to avoid the need for a followup call. 433 */ 434 if (swapped > 0) 435 freed += swapped; 436 if (freed > 0) 437 info->alloced -= freed; 438 spin_unlock(&info->lock); 439 440 /* The quota case may block */ 441 if (freed > 0) 442 shmem_inode_unacct_blocks(inode, freed); 443 } 444 445 bool shmem_charge(struct inode *inode, long pages) 446 { 447 struct address_space *mapping = inode->i_mapping; 448 449 if (shmem_inode_acct_blocks(inode, pages)) 450 return false; 451 452 /* nrpages adjustment first, then shmem_recalc_inode() when balanced */ 453 xa_lock_irq(&mapping->i_pages); 454 mapping->nrpages += pages; 455 xa_unlock_irq(&mapping->i_pages); 456 457 shmem_recalc_inode(inode, pages, 0); 458 return true; 459 } 460 461 void shmem_uncharge(struct inode *inode, long pages) 462 { 463 /* pages argument is currently unused: keep it to help debugging */ 464 /* nrpages adjustment done by __filemap_remove_folio() or caller */ 465 466 shmem_recalc_inode(inode, 0, 0); 467 } 468 469 /* 470 * Replace item expected in xarray by a new item, while holding xa_lock. 471 */ 472 static int shmem_replace_entry(struct address_space *mapping, 473 pgoff_t index, void *expected, void *replacement) 474 { 475 XA_STATE(xas, &mapping->i_pages, index); 476 void *item; 477 478 VM_BUG_ON(!expected); 479 VM_BUG_ON(!replacement); 480 item = xas_load(&xas); 481 if (item != expected) 482 return -ENOENT; 483 xas_store(&xas, replacement); 484 return 0; 485 } 486 487 /* 488 * Sometimes, before we decide whether to proceed or to fail, we must check 489 * that an entry was not already brought back from swap by a racing thread. 490 * 491 * Checking page is not enough: by the time a SwapCache page is locked, it 492 * might be reused, and again be SwapCache, using the same swap as before. 493 */ 494 static bool shmem_confirm_swap(struct address_space *mapping, 495 pgoff_t index, swp_entry_t swap) 496 { 497 return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap); 498 } 499 500 /* 501 * Definitions for "huge tmpfs": tmpfs mounted with the huge= option 502 * 503 * SHMEM_HUGE_NEVER: 504 * disables huge pages for the mount; 505 * SHMEM_HUGE_ALWAYS: 506 * enables huge pages for the mount; 507 * SHMEM_HUGE_WITHIN_SIZE: 508 * only allocate huge pages if the page will be fully within i_size, 509 * also respect fadvise()/madvise() hints; 510 * SHMEM_HUGE_ADVISE: 511 * only allocate huge pages if requested with fadvise()/madvise(); 512 */ 513 514 #define SHMEM_HUGE_NEVER 0 515 #define SHMEM_HUGE_ALWAYS 1 516 #define SHMEM_HUGE_WITHIN_SIZE 2 517 #define SHMEM_HUGE_ADVISE 3 518 519 /* 520 * Special values. 
521 * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled: 522 * 523 * SHMEM_HUGE_DENY: 524 * disables huge on shm_mnt and all mounts, for emergency use; 525 * SHMEM_HUGE_FORCE: 526 * enables huge on shm_mnt and all mounts, w/o needing option, for testing; 527 * 528 */ 529 #define SHMEM_HUGE_DENY (-1) 530 #define SHMEM_HUGE_FORCE (-2) 531 532 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 533 /* ifdef here to avoid bloating shmem.o when not necessary */ 534 535 static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; 536 537 bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, 538 struct mm_struct *mm, unsigned long vm_flags) 539 { 540 loff_t i_size; 541 542 if (!S_ISREG(inode->i_mode)) 543 return false; 544 if (mm && ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &mm->flags))) 545 return false; 546 if (shmem_huge == SHMEM_HUGE_DENY) 547 return false; 548 if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE) 549 return true; 550 551 switch (SHMEM_SB(inode->i_sb)->huge) { 552 case SHMEM_HUGE_ALWAYS: 553 return true; 554 case SHMEM_HUGE_WITHIN_SIZE: 555 index = round_up(index + 1, HPAGE_PMD_NR); 556 i_size = round_up(i_size_read(inode), PAGE_SIZE); 557 if (i_size >> PAGE_SHIFT >= index) 558 return true; 559 fallthrough; 560 case SHMEM_HUGE_ADVISE: 561 if (mm && (vm_flags & VM_HUGEPAGE)) 562 return true; 563 fallthrough; 564 default: 565 return false; 566 } 567 } 568 569 #if defined(CONFIG_SYSFS) 570 static int shmem_parse_huge(const char *str) 571 { 572 if (!strcmp(str, "never")) 573 return SHMEM_HUGE_NEVER; 574 if (!strcmp(str, "always")) 575 return SHMEM_HUGE_ALWAYS; 576 if (!strcmp(str, "within_size")) 577 return SHMEM_HUGE_WITHIN_SIZE; 578 if (!strcmp(str, "advise")) 579 return SHMEM_HUGE_ADVISE; 580 if (!strcmp(str, "deny")) 581 return SHMEM_HUGE_DENY; 582 if (!strcmp(str, "force")) 583 return SHMEM_HUGE_FORCE; 584 return -EINVAL; 585 } 586 #endif 587 588 #if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS) 589 static const char *shmem_format_huge(int huge) 590 { 591 switch (huge) { 592 case SHMEM_HUGE_NEVER: 593 return "never"; 594 case SHMEM_HUGE_ALWAYS: 595 return "always"; 596 case SHMEM_HUGE_WITHIN_SIZE: 597 return "within_size"; 598 case SHMEM_HUGE_ADVISE: 599 return "advise"; 600 case SHMEM_HUGE_DENY: 601 return "deny"; 602 case SHMEM_HUGE_FORCE: 603 return "force"; 604 default: 605 VM_BUG_ON(1); 606 return "bad_val"; 607 } 608 } 609 #endif 610 611 static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, 612 struct shrink_control *sc, unsigned long nr_to_split) 613 { 614 LIST_HEAD(list), *pos, *next; 615 LIST_HEAD(to_remove); 616 struct inode *inode; 617 struct shmem_inode_info *info; 618 struct folio *folio; 619 unsigned long batch = sc ? 
sc->nr_to_scan : 128; 620 int split = 0; 621 622 if (list_empty(&sbinfo->shrinklist)) 623 return SHRINK_STOP; 624 625 spin_lock(&sbinfo->shrinklist_lock); 626 list_for_each_safe(pos, next, &sbinfo->shrinklist) { 627 info = list_entry(pos, struct shmem_inode_info, shrinklist); 628 629 /* pin the inode */ 630 inode = igrab(&info->vfs_inode); 631 632 /* inode is about to be evicted */ 633 if (!inode) { 634 list_del_init(&info->shrinklist); 635 goto next; 636 } 637 638 /* Check if there's anything to gain */ 639 if (round_up(inode->i_size, PAGE_SIZE) == 640 round_up(inode->i_size, HPAGE_PMD_SIZE)) { 641 list_move(&info->shrinklist, &to_remove); 642 goto next; 643 } 644 645 list_move(&info->shrinklist, &list); 646 next: 647 sbinfo->shrinklist_len--; 648 if (!--batch) 649 break; 650 } 651 spin_unlock(&sbinfo->shrinklist_lock); 652 653 list_for_each_safe(pos, next, &to_remove) { 654 info = list_entry(pos, struct shmem_inode_info, shrinklist); 655 inode = &info->vfs_inode; 656 list_del_init(&info->shrinklist); 657 iput(inode); 658 } 659 660 list_for_each_safe(pos, next, &list) { 661 int ret; 662 pgoff_t index; 663 664 info = list_entry(pos, struct shmem_inode_info, shrinklist); 665 inode = &info->vfs_inode; 666 667 if (nr_to_split && split >= nr_to_split) 668 goto move_back; 669 670 index = (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT; 671 folio = filemap_get_folio(inode->i_mapping, index); 672 if (IS_ERR(folio)) 673 goto drop; 674 675 /* No huge page at the end of the file: nothing to split */ 676 if (!folio_test_large(folio)) { 677 folio_put(folio); 678 goto drop; 679 } 680 681 /* 682 * Move the inode on the list back to shrinklist if we failed 683 * to lock the page at this time. 684 * 685 * Waiting for the lock may lead to deadlock in the 686 * reclaim path. 687 */ 688 if (!folio_trylock(folio)) { 689 folio_put(folio); 690 goto move_back; 691 } 692 693 ret = split_folio(folio); 694 folio_unlock(folio); 695 folio_put(folio); 696 697 /* If split failed move the inode on the list back to shrinklist */ 698 if (ret) 699 goto move_back; 700 701 split++; 702 drop: 703 list_del_init(&info->shrinklist); 704 goto put; 705 move_back: 706 /* 707 * Make sure the inode is either on the global list or deleted 708 * from any local list before iput() since it could be deleted 709 * in another thread once we put the inode (then the local list 710 * is corrupted). 
711 */ 712 spin_lock(&sbinfo->shrinklist_lock); 713 list_move(&info->shrinklist, &sbinfo->shrinklist); 714 sbinfo->shrinklist_len++; 715 spin_unlock(&sbinfo->shrinklist_lock); 716 put: 717 iput(inode); 718 } 719 720 return split; 721 } 722 723 static long shmem_unused_huge_scan(struct super_block *sb, 724 struct shrink_control *sc) 725 { 726 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 727 728 if (!READ_ONCE(sbinfo->shrinklist_len)) 729 return SHRINK_STOP; 730 731 return shmem_unused_huge_shrink(sbinfo, sc, 0); 732 } 733 734 static long shmem_unused_huge_count(struct super_block *sb, 735 struct shrink_control *sc) 736 { 737 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 738 return READ_ONCE(sbinfo->shrinklist_len); 739 } 740 #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ 741 742 #define shmem_huge SHMEM_HUGE_DENY 743 744 bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, 745 struct mm_struct *mm, unsigned long vm_flags) 746 { 747 return false; 748 } 749 750 static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, 751 struct shrink_control *sc, unsigned long nr_to_split) 752 { 753 return 0; 754 } 755 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 756 757 /* 758 * Somewhat like filemap_add_folio, but error if expected item has gone. 759 */ 760 static int shmem_add_to_page_cache(struct folio *folio, 761 struct address_space *mapping, 762 pgoff_t index, void *expected, gfp_t gfp) 763 { 764 XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio)); 765 long nr = folio_nr_pages(folio); 766 767 VM_BUG_ON_FOLIO(index != round_down(index, nr), folio); 768 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 769 VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); 770 VM_BUG_ON(expected && folio_test_large(folio)); 771 772 folio_ref_add(folio, nr); 773 folio->mapping = mapping; 774 folio->index = index; 775 776 gfp &= GFP_RECLAIM_MASK; 777 folio_throttle_swaprate(folio, gfp); 778 779 do { 780 xas_lock_irq(&xas); 781 if (expected != xas_find_conflict(&xas)) { 782 xas_set_err(&xas, -EEXIST); 783 goto unlock; 784 } 785 if (expected && xas_find_conflict(&xas)) { 786 xas_set_err(&xas, -EEXIST); 787 goto unlock; 788 } 789 xas_store(&xas, folio); 790 if (xas_error(&xas)) 791 goto unlock; 792 if (folio_test_pmd_mappable(folio)) 793 __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr); 794 __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr); 795 __lruvec_stat_mod_folio(folio, NR_SHMEM, nr); 796 mapping->nrpages += nr; 797 unlock: 798 xas_unlock_irq(&xas); 799 } while (xas_nomem(&xas, gfp)); 800 801 if (xas_error(&xas)) { 802 folio->mapping = NULL; 803 folio_ref_sub(folio, nr); 804 return xas_error(&xas); 805 } 806 807 return 0; 808 } 809 810 /* 811 * Somewhat like filemap_remove_folio, but substitutes swap for @folio. 812 */ 813 static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) 814 { 815 struct address_space *mapping = folio->mapping; 816 long nr = folio_nr_pages(folio); 817 int error; 818 819 xa_lock_irq(&mapping->i_pages); 820 error = shmem_replace_entry(mapping, folio->index, folio, radswap); 821 folio->mapping = NULL; 822 mapping->nrpages -= nr; 823 __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); 824 __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); 825 xa_unlock_irq(&mapping->i_pages); 826 folio_put(folio); 827 BUG_ON(error); 828 } 829 830 /* 831 * Remove swap entry from page cache, free the swap and its page cache. 
832 */ 833 static int shmem_free_swap(struct address_space *mapping, 834 pgoff_t index, void *radswap) 835 { 836 void *old; 837 838 old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0); 839 if (old != radswap) 840 return -ENOENT; 841 free_swap_and_cache(radix_to_swp_entry(radswap)); 842 return 0; 843 } 844 845 /* 846 * Determine (in bytes) how many of the shmem object's pages mapped by the 847 * given offsets are swapped out. 848 * 849 * This is safe to call without i_rwsem or the i_pages lock thanks to RCU, 850 * as long as the inode doesn't go away and racy results are not a problem. 851 */ 852 unsigned long shmem_partial_swap_usage(struct address_space *mapping, 853 pgoff_t start, pgoff_t end) 854 { 855 XA_STATE(xas, &mapping->i_pages, start); 856 struct page *page; 857 unsigned long swapped = 0; 858 unsigned long max = end - 1; 859 860 rcu_read_lock(); 861 xas_for_each(&xas, page, max) { 862 if (xas_retry(&xas, page)) 863 continue; 864 if (xa_is_value(page)) 865 swapped++; 866 if (xas.xa_index == max) 867 break; 868 if (need_resched()) { 869 xas_pause(&xas); 870 cond_resched_rcu(); 871 } 872 } 873 rcu_read_unlock(); 874 875 return swapped << PAGE_SHIFT; 876 } 877 878 /* 879 * Determine (in bytes) how many of the shmem object's pages mapped by the 880 * given vma is swapped out. 881 * 882 * This is safe to call without i_rwsem or the i_pages lock thanks to RCU, 883 * as long as the inode doesn't go away and racy results are not a problem. 884 */ 885 unsigned long shmem_swap_usage(struct vm_area_struct *vma) 886 { 887 struct inode *inode = file_inode(vma->vm_file); 888 struct shmem_inode_info *info = SHMEM_I(inode); 889 struct address_space *mapping = inode->i_mapping; 890 unsigned long swapped; 891 892 /* Be careful as we don't hold info->lock */ 893 swapped = READ_ONCE(info->swapped); 894 895 /* 896 * The easier cases are when the shmem object has nothing in swap, or 897 * the vma maps it whole. Then we can simply use the stats that we 898 * already track. 899 */ 900 if (!swapped) 901 return 0; 902 903 if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size) 904 return swapped << PAGE_SHIFT; 905 906 /* Here comes the more involved part */ 907 return shmem_partial_swap_usage(mapping, vma->vm_pgoff, 908 vma->vm_pgoff + vma_pages(vma)); 909 } 910 911 /* 912 * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists. 913 */ 914 void shmem_unlock_mapping(struct address_space *mapping) 915 { 916 struct folio_batch fbatch; 917 pgoff_t index = 0; 918 919 folio_batch_init(&fbatch); 920 /* 921 * Minor point, but we might as well stop if someone else SHM_LOCKs it. 922 */ 923 while (!mapping_unevictable(mapping) && 924 filemap_get_folios(mapping, &index, ~0UL, &fbatch)) { 925 check_move_unevictable_folios(&fbatch); 926 folio_batch_release(&fbatch); 927 cond_resched(); 928 } 929 } 930 931 static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index) 932 { 933 struct folio *folio; 934 935 /* 936 * At first avoid shmem_get_folio(,,,SGP_READ): that fails 937 * beyond i_size, and reports fallocated folios as holes. 
938 */ 939 folio = filemap_get_entry(inode->i_mapping, index); 940 if (!folio) 941 return folio; 942 if (!xa_is_value(folio)) { 943 folio_lock(folio); 944 if (folio->mapping == inode->i_mapping) 945 return folio; 946 /* The folio has been swapped out */ 947 folio_unlock(folio); 948 folio_put(folio); 949 } 950 /* 951 * But read a folio back from swap if any of it is within i_size 952 * (although in some cases this is just a waste of time). 953 */ 954 folio = NULL; 955 shmem_get_folio(inode, index, &folio, SGP_READ); 956 return folio; 957 } 958 959 /* 960 * Remove range of pages and swap entries from page cache, and free them. 961 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. 962 */ 963 static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, 964 bool unfalloc) 965 { 966 struct address_space *mapping = inode->i_mapping; 967 struct shmem_inode_info *info = SHMEM_I(inode); 968 pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT; 969 pgoff_t end = (lend + 1) >> PAGE_SHIFT; 970 struct folio_batch fbatch; 971 pgoff_t indices[PAGEVEC_SIZE]; 972 struct folio *folio; 973 bool same_folio; 974 long nr_swaps_freed = 0; 975 pgoff_t index; 976 int i; 977 978 if (lend == -1) 979 end = -1; /* unsigned, so actually very big */ 980 981 if (info->fallocend > start && info->fallocend <= end && !unfalloc) 982 info->fallocend = start; 983 984 folio_batch_init(&fbatch); 985 index = start; 986 while (index < end && find_lock_entries(mapping, &index, end - 1, 987 &fbatch, indices)) { 988 for (i = 0; i < folio_batch_count(&fbatch); i++) { 989 folio = fbatch.folios[i]; 990 991 if (xa_is_value(folio)) { 992 if (unfalloc) 993 continue; 994 nr_swaps_freed += !shmem_free_swap(mapping, 995 indices[i], folio); 996 continue; 997 } 998 999 if (!unfalloc || !folio_test_uptodate(folio)) 1000 truncate_inode_folio(mapping, folio); 1001 folio_unlock(folio); 1002 } 1003 folio_batch_remove_exceptionals(&fbatch); 1004 folio_batch_release(&fbatch); 1005 cond_resched(); 1006 } 1007 1008 /* 1009 * When undoing a failed fallocate, we want none of the partial folio 1010 * zeroing and splitting below, but shall want to truncate the whole 1011 * folio when !uptodate indicates that it was added by this fallocate, 1012 * even when [lstart, lend] covers only a part of the folio. 
1013 */ 1014 if (unfalloc) 1015 goto whole_folios; 1016 1017 same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); 1018 folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT); 1019 if (folio) { 1020 same_folio = lend < folio_pos(folio) + folio_size(folio); 1021 folio_mark_dirty(folio); 1022 if (!truncate_inode_partial_folio(folio, lstart, lend)) { 1023 start = folio_next_index(folio); 1024 if (same_folio) 1025 end = folio->index; 1026 } 1027 folio_unlock(folio); 1028 folio_put(folio); 1029 folio = NULL; 1030 } 1031 1032 if (!same_folio) 1033 folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT); 1034 if (folio) { 1035 folio_mark_dirty(folio); 1036 if (!truncate_inode_partial_folio(folio, lstart, lend)) 1037 end = folio->index; 1038 folio_unlock(folio); 1039 folio_put(folio); 1040 } 1041 1042 whole_folios: 1043 1044 index = start; 1045 while (index < end) { 1046 cond_resched(); 1047 1048 if (!find_get_entries(mapping, &index, end - 1, &fbatch, 1049 indices)) { 1050 /* If all gone or hole-punch or unfalloc, we're done */ 1051 if (index == start || end != -1) 1052 break; 1053 /* But if truncating, restart to make sure all gone */ 1054 index = start; 1055 continue; 1056 } 1057 for (i = 0; i < folio_batch_count(&fbatch); i++) { 1058 folio = fbatch.folios[i]; 1059 1060 if (xa_is_value(folio)) { 1061 if (unfalloc) 1062 continue; 1063 if (shmem_free_swap(mapping, indices[i], folio)) { 1064 /* Swap was replaced by page: retry */ 1065 index = indices[i]; 1066 break; 1067 } 1068 nr_swaps_freed++; 1069 continue; 1070 } 1071 1072 folio_lock(folio); 1073 1074 if (!unfalloc || !folio_test_uptodate(folio)) { 1075 if (folio_mapping(folio) != mapping) { 1076 /* Page was replaced by swap: retry */ 1077 folio_unlock(folio); 1078 index = indices[i]; 1079 break; 1080 } 1081 VM_BUG_ON_FOLIO(folio_test_writeback(folio), 1082 folio); 1083 truncate_inode_folio(mapping, folio); 1084 } 1085 folio_unlock(folio); 1086 } 1087 folio_batch_remove_exceptionals(&fbatch); 1088 folio_batch_release(&fbatch); 1089 } 1090 1091 shmem_recalc_inode(inode, 0, -nr_swaps_freed); 1092 } 1093 1094 void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) 1095 { 1096 shmem_undo_range(inode, lstart, lend, false); 1097 inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); 1098 inode_inc_iversion(inode); 1099 } 1100 EXPORT_SYMBOL_GPL(shmem_truncate_range); 1101 1102 static int shmem_getattr(struct mnt_idmap *idmap, 1103 const struct path *path, struct kstat *stat, 1104 u32 request_mask, unsigned int query_flags) 1105 { 1106 struct inode *inode = path->dentry->d_inode; 1107 struct shmem_inode_info *info = SHMEM_I(inode); 1108 1109 if (info->alloced - info->swapped != inode->i_mapping->nrpages) 1110 shmem_recalc_inode(inode, 0, 0); 1111 1112 if (info->fsflags & FS_APPEND_FL) 1113 stat->attributes |= STATX_ATTR_APPEND; 1114 if (info->fsflags & FS_IMMUTABLE_FL) 1115 stat->attributes |= STATX_ATTR_IMMUTABLE; 1116 if (info->fsflags & FS_NODUMP_FL) 1117 stat->attributes |= STATX_ATTR_NODUMP; 1118 stat->attributes_mask |= (STATX_ATTR_APPEND | 1119 STATX_ATTR_IMMUTABLE | 1120 STATX_ATTR_NODUMP); 1121 generic_fillattr(idmap, request_mask, inode, stat); 1122 1123 if (shmem_is_huge(inode, 0, false, NULL, 0)) 1124 stat->blksize = HPAGE_PMD_SIZE; 1125 1126 if (request_mask & STATX_BTIME) { 1127 stat->result_mask |= STATX_BTIME; 1128 stat->btime.tv_sec = info->i_crtime.tv_sec; 1129 stat->btime.tv_nsec = info->i_crtime.tv_nsec; 1130 } 1131 1132 return 0; 1133 } 1134 1135 static int shmem_setattr(struct mnt_idmap 
*idmap, 1136 struct dentry *dentry, struct iattr *attr) 1137 { 1138 struct inode *inode = d_inode(dentry); 1139 struct shmem_inode_info *info = SHMEM_I(inode); 1140 int error; 1141 bool update_mtime = false; 1142 bool update_ctime = true; 1143 1144 error = setattr_prepare(idmap, dentry, attr); 1145 if (error) 1146 return error; 1147 1148 if ((info->seals & F_SEAL_EXEC) && (attr->ia_valid & ATTR_MODE)) { 1149 if ((inode->i_mode ^ attr->ia_mode) & 0111) { 1150 return -EPERM; 1151 } 1152 } 1153 1154 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 1155 loff_t oldsize = inode->i_size; 1156 loff_t newsize = attr->ia_size; 1157 1158 /* protected by i_rwsem */ 1159 if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || 1160 (newsize > oldsize && (info->seals & F_SEAL_GROW))) 1161 return -EPERM; 1162 1163 if (newsize != oldsize) { 1164 error = shmem_reacct_size(SHMEM_I(inode)->flags, 1165 oldsize, newsize); 1166 if (error) 1167 return error; 1168 i_size_write(inode, newsize); 1169 update_mtime = true; 1170 } else { 1171 update_ctime = false; 1172 } 1173 if (newsize <= oldsize) { 1174 loff_t holebegin = round_up(newsize, PAGE_SIZE); 1175 if (oldsize > holebegin) 1176 unmap_mapping_range(inode->i_mapping, 1177 holebegin, 0, 1); 1178 if (info->alloced) 1179 shmem_truncate_range(inode, 1180 newsize, (loff_t)-1); 1181 /* unmap again to remove racily COWed private pages */ 1182 if (oldsize > holebegin) 1183 unmap_mapping_range(inode->i_mapping, 1184 holebegin, 0, 1); 1185 } 1186 } 1187 1188 if (is_quota_modification(idmap, inode, attr)) { 1189 error = dquot_initialize(inode); 1190 if (error) 1191 return error; 1192 } 1193 1194 /* Transfer quota accounting */ 1195 if (i_uid_needs_update(idmap, attr, inode) || 1196 i_gid_needs_update(idmap, attr, inode)) { 1197 error = dquot_transfer(idmap, inode, attr); 1198 if (error) 1199 return error; 1200 } 1201 1202 setattr_copy(idmap, inode, attr); 1203 if (attr->ia_valid & ATTR_MODE) 1204 error = posix_acl_chmod(idmap, dentry, inode->i_mode); 1205 if (!error && update_ctime) { 1206 inode_set_ctime_current(inode); 1207 if (update_mtime) 1208 inode_set_mtime_to_ts(inode, inode_get_ctime(inode)); 1209 inode_inc_iversion(inode); 1210 } 1211 return error; 1212 } 1213 1214 static void shmem_evict_inode(struct inode *inode) 1215 { 1216 struct shmem_inode_info *info = SHMEM_I(inode); 1217 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 1218 size_t freed = 0; 1219 1220 if (shmem_mapping(inode->i_mapping)) { 1221 shmem_unacct_size(info->flags, inode->i_size); 1222 inode->i_size = 0; 1223 mapping_set_exiting(inode->i_mapping); 1224 shmem_truncate_range(inode, 0, (loff_t)-1); 1225 if (!list_empty(&info->shrinklist)) { 1226 spin_lock(&sbinfo->shrinklist_lock); 1227 if (!list_empty(&info->shrinklist)) { 1228 list_del_init(&info->shrinklist); 1229 sbinfo->shrinklist_len--; 1230 } 1231 spin_unlock(&sbinfo->shrinklist_lock); 1232 } 1233 while (!list_empty(&info->swaplist)) { 1234 /* Wait while shmem_unuse() is scanning this inode... */ 1235 wait_var_event(&info->stop_eviction, 1236 !atomic_read(&info->stop_eviction)); 1237 mutex_lock(&shmem_swaplist_mutex); 1238 /* ...but beware of the race if we peeked too early */ 1239 if (!atomic_read(&info->stop_eviction)) 1240 list_del_init(&info->swaplist); 1241 mutex_unlock(&shmem_swaplist_mutex); 1242 } 1243 } 1244 1245 simple_xattrs_free(&info->xattrs, sbinfo->max_inodes ? 
&freed : NULL); 1246 shmem_free_inode(inode->i_sb, freed); 1247 WARN_ON(inode->i_blocks); 1248 clear_inode(inode); 1249 #ifdef CONFIG_TMPFS_QUOTA 1250 dquot_free_inode(inode); 1251 dquot_drop(inode); 1252 #endif 1253 } 1254 1255 static int shmem_find_swap_entries(struct address_space *mapping, 1256 pgoff_t start, struct folio_batch *fbatch, 1257 pgoff_t *indices, unsigned int type) 1258 { 1259 XA_STATE(xas, &mapping->i_pages, start); 1260 struct folio *folio; 1261 swp_entry_t entry; 1262 1263 rcu_read_lock(); 1264 xas_for_each(&xas, folio, ULONG_MAX) { 1265 if (xas_retry(&xas, folio)) 1266 continue; 1267 1268 if (!xa_is_value(folio)) 1269 continue; 1270 1271 entry = radix_to_swp_entry(folio); 1272 /* 1273 * swapin error entries can be found in the mapping. But they're 1274 * deliberately ignored here as we've done everything we can do. 1275 */ 1276 if (swp_type(entry) != type) 1277 continue; 1278 1279 indices[folio_batch_count(fbatch)] = xas.xa_index; 1280 if (!folio_batch_add(fbatch, folio)) 1281 break; 1282 1283 if (need_resched()) { 1284 xas_pause(&xas); 1285 cond_resched_rcu(); 1286 } 1287 } 1288 rcu_read_unlock(); 1289 1290 return xas.xa_index; 1291 } 1292 1293 /* 1294 * Move the swapped pages for an inode to page cache. Returns the count 1295 * of pages swapped in, or the error in case of failure. 1296 */ 1297 static int shmem_unuse_swap_entries(struct inode *inode, 1298 struct folio_batch *fbatch, pgoff_t *indices) 1299 { 1300 int i = 0; 1301 int ret = 0; 1302 int error = 0; 1303 struct address_space *mapping = inode->i_mapping; 1304 1305 for (i = 0; i < folio_batch_count(fbatch); i++) { 1306 struct folio *folio = fbatch->folios[i]; 1307 1308 if (!xa_is_value(folio)) 1309 continue; 1310 error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE, 1311 mapping_gfp_mask(mapping), NULL, NULL); 1312 if (error == 0) { 1313 folio_unlock(folio); 1314 folio_put(folio); 1315 ret++; 1316 } 1317 if (error == -ENOMEM) 1318 break; 1319 error = 0; 1320 } 1321 return error ? error : ret; 1322 } 1323 1324 /* 1325 * If swap found in inode, free it and move page from swapcache to filecache. 1326 */ 1327 static int shmem_unuse_inode(struct inode *inode, unsigned int type) 1328 { 1329 struct address_space *mapping = inode->i_mapping; 1330 pgoff_t start = 0; 1331 struct folio_batch fbatch; 1332 pgoff_t indices[PAGEVEC_SIZE]; 1333 int ret = 0; 1334 1335 do { 1336 folio_batch_init(&fbatch); 1337 shmem_find_swap_entries(mapping, start, &fbatch, indices, type); 1338 if (folio_batch_count(&fbatch) == 0) { 1339 ret = 0; 1340 break; 1341 } 1342 1343 ret = shmem_unuse_swap_entries(inode, &fbatch, indices); 1344 if (ret < 0) 1345 break; 1346 1347 start = indices[folio_batch_count(&fbatch) - 1]; 1348 } while (true); 1349 1350 return ret; 1351 } 1352 1353 /* 1354 * Read all the shared memory data that resides in the swap 1355 * device 'type' back into memory, so the swap device can be 1356 * unused. 
1357 */ 1358 int shmem_unuse(unsigned int type) 1359 { 1360 struct shmem_inode_info *info, *next; 1361 int error = 0; 1362 1363 if (list_empty(&shmem_swaplist)) 1364 return 0; 1365 1366 mutex_lock(&shmem_swaplist_mutex); 1367 list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { 1368 if (!info->swapped) { 1369 list_del_init(&info->swaplist); 1370 continue; 1371 } 1372 /* 1373 * Drop the swaplist mutex while searching the inode for swap; 1374 * but before doing so, make sure shmem_evict_inode() will not 1375 * remove placeholder inode from swaplist, nor let it be freed 1376 * (igrab() would protect from unlink, but not from unmount). 1377 */ 1378 atomic_inc(&info->stop_eviction); 1379 mutex_unlock(&shmem_swaplist_mutex); 1380 1381 error = shmem_unuse_inode(&info->vfs_inode, type); 1382 cond_resched(); 1383 1384 mutex_lock(&shmem_swaplist_mutex); 1385 next = list_next_entry(info, swaplist); 1386 if (!info->swapped) 1387 list_del_init(&info->swaplist); 1388 if (atomic_dec_and_test(&info->stop_eviction)) 1389 wake_up_var(&info->stop_eviction); 1390 if (error) 1391 break; 1392 } 1393 mutex_unlock(&shmem_swaplist_mutex); 1394 1395 return error; 1396 } 1397 1398 /* 1399 * Move the page from the page cache to the swap cache. 1400 */ 1401 static int shmem_writepage(struct page *page, struct writeback_control *wbc) 1402 { 1403 struct folio *folio = page_folio(page); 1404 struct address_space *mapping = folio->mapping; 1405 struct inode *inode = mapping->host; 1406 struct shmem_inode_info *info = SHMEM_I(inode); 1407 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 1408 swp_entry_t swap; 1409 pgoff_t index; 1410 1411 /* 1412 * Our capabilities prevent regular writeback or sync from ever calling 1413 * shmem_writepage; but a stacking filesystem might use ->writepage of 1414 * its underlying filesystem, in which case tmpfs should write out to 1415 * swap only in response to memory pressure, and not for the writeback 1416 * threads or sync. 1417 */ 1418 if (WARN_ON_ONCE(!wbc->for_reclaim)) 1419 goto redirty; 1420 1421 if (WARN_ON_ONCE((info->flags & VM_LOCKED) || sbinfo->noswap)) 1422 goto redirty; 1423 1424 if (!total_swap_pages) 1425 goto redirty; 1426 1427 /* 1428 * If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or 1429 * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages, 1430 * and its shmem_writeback() needs them to be split when swapping. 1431 */ 1432 if (folio_test_large(folio)) { 1433 /* Ensure the subpages are still dirty */ 1434 folio_test_set_dirty(folio); 1435 if (split_huge_page(page) < 0) 1436 goto redirty; 1437 folio = page_folio(page); 1438 folio_clear_dirty(folio); 1439 } 1440 1441 index = folio->index; 1442 1443 /* 1444 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC 1445 * value into swapfile.c, the only way we can correctly account for a 1446 * fallocated folio arriving here is now to initialize it and write it. 1447 * 1448 * That's okay for a folio already fallocated earlier, but if we have 1449 * not yet completed the fallocation, then (a) we want to keep track 1450 * of this folio in case we have to undo it, and (b) it may not be a 1451 * good idea to continue anyway, once we're pushing into swap. So 1452 * reactivate the folio, and let shmem_fallocate() quit when too many. 
1453 */ 1454 if (!folio_test_uptodate(folio)) { 1455 if (inode->i_private) { 1456 struct shmem_falloc *shmem_falloc; 1457 spin_lock(&inode->i_lock); 1458 shmem_falloc = inode->i_private; 1459 if (shmem_falloc && 1460 !shmem_falloc->waitq && 1461 index >= shmem_falloc->start && 1462 index < shmem_falloc->next) 1463 shmem_falloc->nr_unswapped++; 1464 else 1465 shmem_falloc = NULL; 1466 spin_unlock(&inode->i_lock); 1467 if (shmem_falloc) 1468 goto redirty; 1469 } 1470 folio_zero_range(folio, 0, folio_size(folio)); 1471 flush_dcache_folio(folio); 1472 folio_mark_uptodate(folio); 1473 } 1474 1475 swap = folio_alloc_swap(folio); 1476 if (!swap.val) 1477 goto redirty; 1478 1479 /* 1480 * Add inode to shmem_unuse()'s list of swapped-out inodes, 1481 * if it's not already there. Do it now before the folio is 1482 * moved to swap cache, when its pagelock no longer protects 1483 * the inode from eviction. But don't unlock the mutex until 1484 * we've incremented swapped, because shmem_unuse_inode() will 1485 * prune a !swapped inode from the swaplist under this mutex. 1486 */ 1487 mutex_lock(&shmem_swaplist_mutex); 1488 if (list_empty(&info->swaplist)) 1489 list_add(&info->swaplist, &shmem_swaplist); 1490 1491 if (add_to_swap_cache(folio, swap, 1492 __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN, 1493 NULL) == 0) { 1494 shmem_recalc_inode(inode, 0, 1); 1495 swap_shmem_alloc(swap); 1496 shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap)); 1497 1498 mutex_unlock(&shmem_swaplist_mutex); 1499 BUG_ON(folio_mapped(folio)); 1500 swap_writepage(&folio->page, wbc); 1501 return 0; 1502 } 1503 1504 mutex_unlock(&shmem_swaplist_mutex); 1505 put_swap_folio(folio, swap); 1506 redirty: 1507 folio_mark_dirty(folio); 1508 if (wbc->for_reclaim) 1509 return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */ 1510 folio_unlock(folio); 1511 return 0; 1512 } 1513 1514 #if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS) 1515 static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) 1516 { 1517 char buffer[64]; 1518 1519 if (!mpol || mpol->mode == MPOL_DEFAULT) 1520 return; /* show nothing */ 1521 1522 mpol_to_str(buffer, sizeof(buffer), mpol); 1523 1524 seq_printf(seq, ",mpol=%s", buffer); 1525 } 1526 1527 static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) 1528 { 1529 struct mempolicy *mpol = NULL; 1530 if (sbinfo->mpol) { 1531 raw_spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ 1532 mpol = sbinfo->mpol; 1533 mpol_get(mpol); 1534 raw_spin_unlock(&sbinfo->stat_lock); 1535 } 1536 return mpol; 1537 } 1538 #else /* !CONFIG_NUMA || !CONFIG_TMPFS */ 1539 static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) 1540 { 1541 } 1542 static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) 1543 { 1544 return NULL; 1545 } 1546 #endif /* CONFIG_NUMA && CONFIG_TMPFS */ 1547 1548 static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, 1549 pgoff_t index, unsigned int order, pgoff_t *ilx); 1550 1551 static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp, 1552 struct shmem_inode_info *info, pgoff_t index) 1553 { 1554 struct mempolicy *mpol; 1555 pgoff_t ilx; 1556 struct page *page; 1557 1558 mpol = shmem_get_pgoff_policy(info, index, 0, &ilx); 1559 page = swap_cluster_readahead(swap, gfp, mpol, ilx); 1560 mpol_cond_put(mpol); 1561 1562 if (!page) 1563 return NULL; 1564 return page_folio(page); 1565 } 1566 1567 /* 1568 * Make sure huge_gfp is always more limited than limit_gfp. 
1569 * Some of the flags set permissions, while others set limitations. 1570 */ 1571 static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) 1572 { 1573 gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM; 1574 gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY; 1575 gfp_t zoneflags = limit_gfp & GFP_ZONEMASK; 1576 gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK); 1577 1578 /* Allow allocations only from the originally specified zones. */ 1579 result |= zoneflags; 1580 1581 /* 1582 * Minimize the result gfp by taking the union with the deny flags, 1583 * and the intersection of the allow flags. 1584 */ 1585 result |= (limit_gfp & denyflags); 1586 result |= (huge_gfp & limit_gfp) & allowflags; 1587 1588 return result; 1589 } 1590 1591 static struct folio *shmem_alloc_hugefolio(gfp_t gfp, 1592 struct shmem_inode_info *info, pgoff_t index) 1593 { 1594 struct mempolicy *mpol; 1595 pgoff_t ilx; 1596 struct page *page; 1597 1598 mpol = shmem_get_pgoff_policy(info, index, HPAGE_PMD_ORDER, &ilx); 1599 page = alloc_pages_mpol(gfp, HPAGE_PMD_ORDER, mpol, ilx, numa_node_id()); 1600 mpol_cond_put(mpol); 1601 1602 return page_rmappable_folio(page); 1603 } 1604 1605 static struct folio *shmem_alloc_folio(gfp_t gfp, 1606 struct shmem_inode_info *info, pgoff_t index) 1607 { 1608 struct mempolicy *mpol; 1609 pgoff_t ilx; 1610 struct page *page; 1611 1612 mpol = shmem_get_pgoff_policy(info, index, 0, &ilx); 1613 page = alloc_pages_mpol(gfp, 0, mpol, ilx, numa_node_id()); 1614 mpol_cond_put(mpol); 1615 1616 return (struct folio *)page; 1617 } 1618 1619 static struct folio *shmem_alloc_and_add_folio(gfp_t gfp, 1620 struct inode *inode, pgoff_t index, 1621 struct mm_struct *fault_mm, bool huge) 1622 { 1623 struct address_space *mapping = inode->i_mapping; 1624 struct shmem_inode_info *info = SHMEM_I(inode); 1625 struct folio *folio; 1626 long pages; 1627 int error; 1628 1629 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) 1630 huge = false; 1631 1632 if (huge) { 1633 pages = HPAGE_PMD_NR; 1634 index = round_down(index, HPAGE_PMD_NR); 1635 1636 /* 1637 * Check for conflict before waiting on a huge allocation. 1638 * Conflict might be that a huge page has just been allocated 1639 * and added to page cache by a racing thread, or that there 1640 * is already at least one small page in the huge extent. 1641 * Be careful to retry when appropriate, but not forever! 1642 * Elsewhere -EEXIST would be the right code, but not here. 
1643 */ 1644 if (xa_find(&mapping->i_pages, &index, 1645 index + HPAGE_PMD_NR - 1, XA_PRESENT)) 1646 return ERR_PTR(-E2BIG); 1647 1648 folio = shmem_alloc_hugefolio(gfp, info, index); 1649 if (!folio) 1650 count_vm_event(THP_FILE_FALLBACK); 1651 } else { 1652 pages = 1; 1653 folio = shmem_alloc_folio(gfp, info, index); 1654 } 1655 if (!folio) 1656 return ERR_PTR(-ENOMEM); 1657 1658 __folio_set_locked(folio); 1659 __folio_set_swapbacked(folio); 1660 1661 gfp &= GFP_RECLAIM_MASK; 1662 error = mem_cgroup_charge(folio, fault_mm, gfp); 1663 if (error) { 1664 if (xa_find(&mapping->i_pages, &index, 1665 index + pages - 1, XA_PRESENT)) { 1666 error = -EEXIST; 1667 } else if (huge) { 1668 count_vm_event(THP_FILE_FALLBACK); 1669 count_vm_event(THP_FILE_FALLBACK_CHARGE); 1670 } 1671 goto unlock; 1672 } 1673 1674 error = shmem_add_to_page_cache(folio, mapping, index, NULL, gfp); 1675 if (error) 1676 goto unlock; 1677 1678 error = shmem_inode_acct_blocks(inode, pages); 1679 if (error) { 1680 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 1681 long freed; 1682 /* 1683 * Try to reclaim some space by splitting a few 1684 * large folios beyond i_size on the filesystem. 1685 */ 1686 shmem_unused_huge_shrink(sbinfo, NULL, 2); 1687 /* 1688 * And do a shmem_recalc_inode() to account for freed pages: 1689 * except our folio is there in cache, so not quite balanced. 1690 */ 1691 spin_lock(&info->lock); 1692 freed = pages + info->alloced - info->swapped - 1693 READ_ONCE(mapping->nrpages); 1694 if (freed > 0) 1695 info->alloced -= freed; 1696 spin_unlock(&info->lock); 1697 if (freed > 0) 1698 shmem_inode_unacct_blocks(inode, freed); 1699 error = shmem_inode_acct_blocks(inode, pages); 1700 if (error) { 1701 filemap_remove_folio(folio); 1702 goto unlock; 1703 } 1704 } 1705 1706 shmem_recalc_inode(inode, pages, 0); 1707 folio_add_lru(folio); 1708 return folio; 1709 1710 unlock: 1711 folio_unlock(folio); 1712 folio_put(folio); 1713 return ERR_PTR(error); 1714 } 1715 1716 /* 1717 * When a page is moved from swapcache to shmem filecache (either by the 1718 * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of 1719 * shmem_unuse_inode()), it may have been read in earlier from swap, in 1720 * ignorance of the mapping it belongs to. If that mapping has special 1721 * constraints (like the gma500 GEM driver, which requires RAM below 4GB), 1722 * we may need to copy to a suitable page before moving to filecache. 1723 * 1724 * In a future release, this may well be extended to respect cpuset and 1725 * NUMA mempolicy, and applied also to anonymous pages in do_swap_page(); 1726 * but for now it is a simple matter of zone. 1727 */ 1728 static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp) 1729 { 1730 return folio_zonenum(folio) > gfp_zone(gfp); 1731 } 1732 1733 static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, 1734 struct shmem_inode_info *info, pgoff_t index) 1735 { 1736 struct folio *old, *new; 1737 struct address_space *swap_mapping; 1738 swp_entry_t entry; 1739 pgoff_t swap_index; 1740 int error; 1741 1742 old = *foliop; 1743 entry = old->swap; 1744 swap_index = swp_offset(entry); 1745 swap_mapping = swap_address_space(entry); 1746 1747 /* 1748 * We have arrived here because our zones are constrained, so don't 1749 * limit chance of success by further cpuset and node constraints. 
1750 */ 1751 gfp &= ~GFP_CONSTRAINT_MASK; 1752 VM_BUG_ON_FOLIO(folio_test_large(old), old); 1753 new = shmem_alloc_folio(gfp, info, index); 1754 if (!new) 1755 return -ENOMEM; 1756 1757 folio_get(new); 1758 folio_copy(new, old); 1759 flush_dcache_folio(new); 1760 1761 __folio_set_locked(new); 1762 __folio_set_swapbacked(new); 1763 folio_mark_uptodate(new); 1764 new->swap = entry; 1765 folio_set_swapcache(new); 1766 1767 /* 1768 * Our caller will very soon move newpage out of swapcache, but it's 1769 * a nice clean interface for us to replace oldpage by newpage there. 1770 */ 1771 xa_lock_irq(&swap_mapping->i_pages); 1772 error = shmem_replace_entry(swap_mapping, swap_index, old, new); 1773 if (!error) { 1774 mem_cgroup_migrate(old, new); 1775 __lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1); 1776 __lruvec_stat_mod_folio(new, NR_SHMEM, 1); 1777 __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1); 1778 __lruvec_stat_mod_folio(old, NR_SHMEM, -1); 1779 } 1780 xa_unlock_irq(&swap_mapping->i_pages); 1781 1782 if (unlikely(error)) { 1783 /* 1784 * Is this possible? I think not, now that our callers check 1785 * both PageSwapCache and page_private after getting page lock; 1786 * but be defensive. Reverse old to newpage for clear and free. 1787 */ 1788 old = new; 1789 } else { 1790 folio_add_lru(new); 1791 *foliop = new; 1792 } 1793 1794 folio_clear_swapcache(old); 1795 old->private = NULL; 1796 1797 folio_unlock(old); 1798 folio_put_refs(old, 2); 1799 return error; 1800 } 1801 1802 static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, 1803 struct folio *folio, swp_entry_t swap) 1804 { 1805 struct address_space *mapping = inode->i_mapping; 1806 swp_entry_t swapin_error; 1807 void *old; 1808 1809 swapin_error = make_poisoned_swp_entry(); 1810 old = xa_cmpxchg_irq(&mapping->i_pages, index, 1811 swp_to_radix_entry(swap), 1812 swp_to_radix_entry(swapin_error), 0); 1813 if (old != swp_to_radix_entry(swap)) 1814 return; 1815 1816 folio_wait_writeback(folio); 1817 delete_from_swap_cache(folio); 1818 /* 1819 * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks 1820 * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks) 1821 * in shmem_evict_inode(). 1822 */ 1823 shmem_recalc_inode(inode, -1, -1); 1824 swap_free(swap); 1825 } 1826 1827 /* 1828 * Swap in the folio pointed to by *foliop. 1829 * Caller has to make sure that *foliop contains a valid swapped folio. 1830 * Returns 0 and the folio in foliop if success. On failure, returns the 1831 * error code and NULL in *foliop. 1832 */ 1833 static int shmem_swapin_folio(struct inode *inode, pgoff_t index, 1834 struct folio **foliop, enum sgp_type sgp, 1835 gfp_t gfp, struct mm_struct *fault_mm, 1836 vm_fault_t *fault_type) 1837 { 1838 struct address_space *mapping = inode->i_mapping; 1839 struct shmem_inode_info *info = SHMEM_I(inode); 1840 struct swap_info_struct *si; 1841 struct folio *folio = NULL; 1842 swp_entry_t swap; 1843 int error; 1844 1845 VM_BUG_ON(!*foliop || !xa_is_value(*foliop)); 1846 swap = radix_to_swp_entry(*foliop); 1847 *foliop = NULL; 1848 1849 if (is_poisoned_swp_entry(swap)) 1850 return -EIO; 1851 1852 si = get_swap_device(swap); 1853 if (!si) { 1854 if (!shmem_confirm_swap(mapping, index, swap)) 1855 return -EEXIST; 1856 else 1857 return -EINVAL; 1858 } 1859 1860 /* Look it up and read it in.. */ 1861 folio = swap_cache_get_folio(swap, NULL, 0); 1862 if (!folio) { 1863 /* Or update major stats only when swapin succeeds?? 
*/ 1864 if (fault_type) { 1865 *fault_type |= VM_FAULT_MAJOR; 1866 count_vm_event(PGMAJFAULT); 1867 count_memcg_event_mm(fault_mm, PGMAJFAULT); 1868 } 1869 /* Here we actually start the io */ 1870 folio = shmem_swapin_cluster(swap, gfp, info, index); 1871 if (!folio) { 1872 error = -ENOMEM; 1873 goto failed; 1874 } 1875 } 1876 1877 /* We have to do this with folio locked to prevent races */ 1878 folio_lock(folio); 1879 if (!folio_test_swapcache(folio) || 1880 folio->swap.val != swap.val || 1881 !shmem_confirm_swap(mapping, index, swap)) { 1882 error = -EEXIST; 1883 goto unlock; 1884 } 1885 if (!folio_test_uptodate(folio)) { 1886 error = -EIO; 1887 goto failed; 1888 } 1889 folio_wait_writeback(folio); 1890 1891 /* 1892 * Some architectures may have to restore extra metadata to the 1893 * folio after reading from swap. 1894 */ 1895 arch_swap_restore(swap, folio); 1896 1897 if (shmem_should_replace_folio(folio, gfp)) { 1898 error = shmem_replace_folio(&folio, gfp, info, index); 1899 if (error) 1900 goto failed; 1901 } 1902 1903 error = shmem_add_to_page_cache(folio, mapping, index, 1904 swp_to_radix_entry(swap), gfp); 1905 if (error) 1906 goto failed; 1907 1908 shmem_recalc_inode(inode, 0, -1); 1909 1910 if (sgp == SGP_WRITE) 1911 folio_mark_accessed(folio); 1912 1913 delete_from_swap_cache(folio); 1914 folio_mark_dirty(folio); 1915 swap_free(swap); 1916 put_swap_device(si); 1917 1918 *foliop = folio; 1919 return 0; 1920 failed: 1921 if (!shmem_confirm_swap(mapping, index, swap)) 1922 error = -EEXIST; 1923 if (error == -EIO) 1924 shmem_set_folio_swapin_error(inode, index, folio, swap); 1925 unlock: 1926 if (folio) { 1927 folio_unlock(folio); 1928 folio_put(folio); 1929 } 1930 put_swap_device(si); 1931 1932 return error; 1933 } 1934 1935 /* 1936 * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate 1937 * 1938 * If we allocate a new one we do not mark it dirty. That's up to the 1939 * vm. If we swap it in we mark it dirty since we also free the swap 1940 * entry since a page cannot live in both the swap and page cache. 1941 * 1942 * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL. 1943 */ 1944 static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, 1945 struct folio **foliop, enum sgp_type sgp, gfp_t gfp, 1946 struct vm_fault *vmf, vm_fault_t *fault_type) 1947 { 1948 struct vm_area_struct *vma = vmf ? vmf->vma : NULL; 1949 struct mm_struct *fault_mm; 1950 struct folio *folio; 1951 int error; 1952 bool alloced; 1953 1954 if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) 1955 return -EFBIG; 1956 repeat: 1957 if (sgp <= SGP_CACHE && 1958 ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) 1959 return -EINVAL; 1960 1961 alloced = false; 1962 fault_mm = vma ? vma->vm_mm : NULL; 1963 1964 folio = filemap_get_entry(inode->i_mapping, index); 1965 if (folio && vma && userfaultfd_minor(vma)) { 1966 if (!xa_is_value(folio)) 1967 folio_put(folio); 1968 *fault_type = handle_userfault(vmf, VM_UFFD_MINOR); 1969 return 0; 1970 } 1971 1972 if (xa_is_value(folio)) { 1973 error = shmem_swapin_folio(inode, index, &folio, 1974 sgp, gfp, fault_mm, fault_type); 1975 if (error == -EEXIST) 1976 goto repeat; 1977 1978 *foliop = folio; 1979 return error; 1980 } 1981 1982 if (folio) { 1983 folio_lock(folio); 1984 1985 /* Has the folio been truncated or swapped out? 
*/ 1986 if (unlikely(folio->mapping != inode->i_mapping)) { 1987 folio_unlock(folio); 1988 folio_put(folio); 1989 goto repeat; 1990 } 1991 if (sgp == SGP_WRITE) 1992 folio_mark_accessed(folio); 1993 if (folio_test_uptodate(folio)) 1994 goto out; 1995 /* fallocated folio */ 1996 if (sgp != SGP_READ) 1997 goto clear; 1998 folio_unlock(folio); 1999 folio_put(folio); 2000 } 2001 2002 /* 2003 * SGP_READ: succeed on hole, with NULL folio, letting caller zero. 2004 * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail. 2005 */ 2006 *foliop = NULL; 2007 if (sgp == SGP_READ) 2008 return 0; 2009 if (sgp == SGP_NOALLOC) 2010 return -ENOENT; 2011 2012 /* 2013 * Fast cache lookup and swap lookup did not find it: allocate. 2014 */ 2015 2016 if (vma && userfaultfd_missing(vma)) { 2017 *fault_type = handle_userfault(vmf, VM_UFFD_MISSING); 2018 return 0; 2019 } 2020 2021 if (shmem_is_huge(inode, index, false, fault_mm, 2022 vma ? vma->vm_flags : 0)) { 2023 gfp_t huge_gfp; 2024 2025 huge_gfp = vma_thp_gfp_mask(vma); 2026 huge_gfp = limit_gfp_mask(huge_gfp, gfp); 2027 folio = shmem_alloc_and_add_folio(huge_gfp, 2028 inode, index, fault_mm, true); 2029 if (!IS_ERR(folio)) { 2030 count_vm_event(THP_FILE_ALLOC); 2031 goto alloced; 2032 } 2033 if (PTR_ERR(folio) == -EEXIST) 2034 goto repeat; 2035 } 2036 2037 folio = shmem_alloc_and_add_folio(gfp, inode, index, fault_mm, false); 2038 if (IS_ERR(folio)) { 2039 error = PTR_ERR(folio); 2040 if (error == -EEXIST) 2041 goto repeat; 2042 folio = NULL; 2043 goto unlock; 2044 } 2045 2046 alloced: 2047 alloced = true; 2048 if (folio_test_pmd_mappable(folio) && 2049 DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < 2050 folio_next_index(folio) - 1) { 2051 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 2052 struct shmem_inode_info *info = SHMEM_I(inode); 2053 /* 2054 * Part of the large folio is beyond i_size: subject 2055 * to shrink under memory pressure. 2056 */ 2057 spin_lock(&sbinfo->shrinklist_lock); 2058 /* 2059 * _careful to defend against unlocked access to 2060 * ->shrink_list in shmem_unused_huge_shrink() 2061 */ 2062 if (list_empty_careful(&info->shrinklist)) { 2063 list_add_tail(&info->shrinklist, 2064 &sbinfo->shrinklist); 2065 sbinfo->shrinklist_len++; 2066 } 2067 spin_unlock(&sbinfo->shrinklist_lock); 2068 } 2069 2070 if (sgp == SGP_WRITE) 2071 folio_set_referenced(folio); 2072 /* 2073 * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio. 2074 */ 2075 if (sgp == SGP_FALLOC) 2076 sgp = SGP_WRITE; 2077 clear: 2078 /* 2079 * Let SGP_WRITE caller clear ends if write does not fill folio; 2080 * but SGP_FALLOC on a folio fallocated earlier must initialize 2081 * it now, lest undo on failure cancel our earlier guarantee. 2082 */ 2083 if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) { 2084 long i, n = folio_nr_pages(folio); 2085 2086 for (i = 0; i < n; i++) 2087 clear_highpage(folio_page(folio, i)); 2088 flush_dcache_folio(folio); 2089 folio_mark_uptodate(folio); 2090 } 2091 2092 /* Perhaps the file has been truncated since we checked */ 2093 if (sgp <= SGP_CACHE && 2094 ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { 2095 error = -EINVAL; 2096 goto unlock; 2097 } 2098 out: 2099 *foliop = folio; 2100 return 0; 2101 2102 /* 2103 * Error recovery. 
2104 */ 2105 unlock: 2106 if (alloced) 2107 filemap_remove_folio(folio); 2108 shmem_recalc_inode(inode, 0, 0); 2109 if (folio) { 2110 folio_unlock(folio); 2111 folio_put(folio); 2112 } 2113 return error; 2114 } 2115 2116 int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, 2117 enum sgp_type sgp) 2118 { 2119 return shmem_get_folio_gfp(inode, index, foliop, sgp, 2120 mapping_gfp_mask(inode->i_mapping), NULL, NULL); 2121 } 2122 2123 /* 2124 * This is like autoremove_wake_function, but it removes the wait queue 2125 * entry unconditionally - even if something else had already woken the 2126 * target. 2127 */ 2128 static int synchronous_wake_function(wait_queue_entry_t *wait, 2129 unsigned int mode, int sync, void *key) 2130 { 2131 int ret = default_wake_function(wait, mode, sync, key); 2132 list_del_init(&wait->entry); 2133 return ret; 2134 } 2135 2136 /* 2137 * Trinity finds that probing a hole which tmpfs is punching can 2138 * prevent the hole-punch from ever completing: which in turn 2139 * locks writers out with its hold on i_rwsem. So refrain from 2140 * faulting pages into the hole while it's being punched. Although 2141 * shmem_undo_range() does remove the additions, it may be unable to 2142 * keep up, as each new page needs its own unmap_mapping_range() call, 2143 * and the i_mmap tree grows ever slower to scan if new vmas are added. 2144 * 2145 * It does not matter if we sometimes reach this check just before the 2146 * hole-punch begins, so that one fault then races with the punch: 2147 * we just need to make racing faults a rare case. 2148 * 2149 * The implementation below would be much simpler if we just used a 2150 * standard mutex or completion: but we cannot take i_rwsem in fault, 2151 * and bloating every shmem inode for this unlikely case would be sad. 2152 */ 2153 static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode) 2154 { 2155 struct shmem_falloc *shmem_falloc; 2156 struct file *fpin = NULL; 2157 vm_fault_t ret = 0; 2158 2159 spin_lock(&inode->i_lock); 2160 shmem_falloc = inode->i_private; 2161 if (shmem_falloc && 2162 shmem_falloc->waitq && 2163 vmf->pgoff >= shmem_falloc->start && 2164 vmf->pgoff < shmem_falloc->next) { 2165 wait_queue_head_t *shmem_falloc_waitq; 2166 DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function); 2167 2168 ret = VM_FAULT_NOPAGE; 2169 fpin = maybe_unlock_mmap_for_io(vmf, NULL); 2170 shmem_falloc_waitq = shmem_falloc->waitq; 2171 prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, 2172 TASK_UNINTERRUPTIBLE); 2173 spin_unlock(&inode->i_lock); 2174 schedule(); 2175 2176 /* 2177 * shmem_falloc_waitq points into the shmem_fallocate() 2178 * stack of the hole-punching task: shmem_falloc_waitq 2179 * is usually invalid by the time we reach here, but 2180 * finish_wait() does not dereference it in that case; 2181 * though i_lock needed lest racing with wake_up_all(). 2182 */ 2183 spin_lock(&inode->i_lock); 2184 finish_wait(shmem_falloc_waitq, &shmem_fault_wait); 2185 } 2186 spin_unlock(&inode->i_lock); 2187 if (fpin) { 2188 fput(fpin); 2189 ret = VM_FAULT_RETRY; 2190 } 2191 return ret; 2192 } 2193 2194 static vm_fault_t shmem_fault(struct vm_fault *vmf) 2195 { 2196 struct inode *inode = file_inode(vmf->vma->vm_file); 2197 gfp_t gfp = mapping_gfp_mask(inode->i_mapping); 2198 struct folio *folio = NULL; 2199 vm_fault_t ret = 0; 2200 int err; 2201 2202 /* 2203 * Trinity finds that probing a hole which tmpfs is punching can 2204 * prevent the hole-punch from ever completing: noted in i_private. 
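	 * The wait itself, which may drop mmap_lock and return
	 * VM_FAULT_RETRY, is handled by shmem_falloc_wait() above.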
2205 */ 2206 if (unlikely(inode->i_private)) { 2207 ret = shmem_falloc_wait(vmf, inode); 2208 if (ret) 2209 return ret; 2210 } 2211 2212 WARN_ON_ONCE(vmf->page != NULL); 2213 err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE, 2214 gfp, vmf, &ret); 2215 if (err) 2216 return vmf_error(err); 2217 if (folio) { 2218 vmf->page = folio_file_page(folio, vmf->pgoff); 2219 ret |= VM_FAULT_LOCKED; 2220 } 2221 return ret; 2222 } 2223 2224 unsigned long shmem_get_unmapped_area(struct file *file, 2225 unsigned long uaddr, unsigned long len, 2226 unsigned long pgoff, unsigned long flags) 2227 { 2228 unsigned long (*get_area)(struct file *, 2229 unsigned long, unsigned long, unsigned long, unsigned long); 2230 unsigned long addr; 2231 unsigned long offset; 2232 unsigned long inflated_len; 2233 unsigned long inflated_addr; 2234 unsigned long inflated_offset; 2235 2236 if (len > TASK_SIZE) 2237 return -ENOMEM; 2238 2239 get_area = current->mm->get_unmapped_area; 2240 addr = get_area(file, uaddr, len, pgoff, flags); 2241 2242 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) 2243 return addr; 2244 if (IS_ERR_VALUE(addr)) 2245 return addr; 2246 if (addr & ~PAGE_MASK) 2247 return addr; 2248 if (addr > TASK_SIZE - len) 2249 return addr; 2250 2251 if (shmem_huge == SHMEM_HUGE_DENY) 2252 return addr; 2253 if (len < HPAGE_PMD_SIZE) 2254 return addr; 2255 if (flags & MAP_FIXED) 2256 return addr; 2257 /* 2258 * Our priority is to support MAP_SHARED mapped hugely; 2259 * and support MAP_PRIVATE mapped hugely too, until it is COWed. 2260 * But if caller specified an address hint and we allocated area there 2261 * successfully, respect that as before. 2262 */ 2263 if (uaddr == addr) 2264 return addr; 2265 2266 if (shmem_huge != SHMEM_HUGE_FORCE) { 2267 struct super_block *sb; 2268 2269 if (file) { 2270 VM_BUG_ON(file->f_op != &shmem_file_operations); 2271 sb = file_inode(file)->i_sb; 2272 } else { 2273 /* 2274 * Called directly from mm/mmap.c, or drivers/char/mem.c 2275 * for "/dev/zero", to create a shared anonymous object. 
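			 * There is no file in that case, so consult the
			 * huge setting of the internal shm_mnt superblock.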
2276 */ 2277 if (IS_ERR(shm_mnt)) 2278 return addr; 2279 sb = shm_mnt->mnt_sb; 2280 } 2281 if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER) 2282 return addr; 2283 } 2284 2285 offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1); 2286 if (offset && offset + len < 2 * HPAGE_PMD_SIZE) 2287 return addr; 2288 if ((addr & (HPAGE_PMD_SIZE-1)) == offset) 2289 return addr; 2290 2291 inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE; 2292 if (inflated_len > TASK_SIZE) 2293 return addr; 2294 if (inflated_len < len) 2295 return addr; 2296 2297 inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags); 2298 if (IS_ERR_VALUE(inflated_addr)) 2299 return addr; 2300 if (inflated_addr & ~PAGE_MASK) 2301 return addr; 2302 2303 inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1); 2304 inflated_addr += offset - inflated_offset; 2305 if (inflated_offset > offset) 2306 inflated_addr += HPAGE_PMD_SIZE; 2307 2308 if (inflated_addr > TASK_SIZE - len) 2309 return addr; 2310 return inflated_addr; 2311 } 2312 2313 #ifdef CONFIG_NUMA 2314 static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) 2315 { 2316 struct inode *inode = file_inode(vma->vm_file); 2317 return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol); 2318 } 2319 2320 static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, 2321 unsigned long addr, pgoff_t *ilx) 2322 { 2323 struct inode *inode = file_inode(vma->vm_file); 2324 pgoff_t index; 2325 2326 /* 2327 * Bias interleave by inode number to distribute better across nodes; 2328 * but this interface is independent of which page order is used, so 2329 * supplies only that bias, letting caller apply the offset (adjusted 2330 * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()). 2331 */ 2332 *ilx = inode->i_ino; 2333 index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 2334 return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); 2335 } 2336 2337 static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, 2338 pgoff_t index, unsigned int order, pgoff_t *ilx) 2339 { 2340 struct mempolicy *mpol; 2341 2342 /* Bias interleave by inode number to distribute better across nodes */ 2343 *ilx = info->vfs_inode.i_ino + (index >> order); 2344 2345 mpol = mpol_shared_policy_lookup(&info->policy, index); 2346 return mpol ? mpol : get_task_policy(current); 2347 } 2348 #else 2349 static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, 2350 pgoff_t index, unsigned int order, pgoff_t *ilx) 2351 { 2352 *ilx = 0; 2353 return NULL; 2354 } 2355 #endif /* CONFIG_NUMA */ 2356 2357 int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) 2358 { 2359 struct inode *inode = file_inode(file); 2360 struct shmem_inode_info *info = SHMEM_I(inode); 2361 int retval = -ENOMEM; 2362 2363 /* 2364 * What serializes the accesses to info->flags? 2365 * ipc_lock_object() when called from shmctl_do_lock(), 2366 * no serialization needed when called from shm_destroy(). 
2367 */ 2368 if (lock && !(info->flags & VM_LOCKED)) { 2369 if (!user_shm_lock(inode->i_size, ucounts)) 2370 goto out_nomem; 2371 info->flags |= VM_LOCKED; 2372 mapping_set_unevictable(file->f_mapping); 2373 } 2374 if (!lock && (info->flags & VM_LOCKED) && ucounts) { 2375 user_shm_unlock(inode->i_size, ucounts); 2376 info->flags &= ~VM_LOCKED; 2377 mapping_clear_unevictable(file->f_mapping); 2378 } 2379 retval = 0; 2380 2381 out_nomem: 2382 return retval; 2383 } 2384 2385 static int shmem_mmap(struct file *file, struct vm_area_struct *vma) 2386 { 2387 struct inode *inode = file_inode(file); 2388 struct shmem_inode_info *info = SHMEM_I(inode); 2389 int ret; 2390 2391 ret = seal_check_write(info->seals, vma); 2392 if (ret) 2393 return ret; 2394 2395 /* arm64 - allow memory tagging on RAM-based files */ 2396 vm_flags_set(vma, VM_MTE_ALLOWED); 2397 2398 file_accessed(file); 2399 /* This is anonymous shared memory if it is unlinked at the time of mmap */ 2400 if (inode->i_nlink) 2401 vma->vm_ops = &shmem_vm_ops; 2402 else 2403 vma->vm_ops = &shmem_anon_vm_ops; 2404 return 0; 2405 } 2406 2407 static int shmem_file_open(struct inode *inode, struct file *file) 2408 { 2409 file->f_mode |= FMODE_CAN_ODIRECT; 2410 return generic_file_open(inode, file); 2411 } 2412 2413 #ifdef CONFIG_TMPFS_XATTR 2414 static int shmem_initxattrs(struct inode *, const struct xattr *, void *); 2415 2416 /* 2417 * chattr's fsflags are unrelated to extended attributes, 2418 * but tmpfs has chosen to enable them under the same config option. 2419 */ 2420 static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) 2421 { 2422 unsigned int i_flags = 0; 2423 2424 if (fsflags & FS_NOATIME_FL) 2425 i_flags |= S_NOATIME; 2426 if (fsflags & FS_APPEND_FL) 2427 i_flags |= S_APPEND; 2428 if (fsflags & FS_IMMUTABLE_FL) 2429 i_flags |= S_IMMUTABLE; 2430 /* 2431 * But FS_NODUMP_FL does not require any action in i_flags. 2432 */ 2433 inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE); 2434 } 2435 #else 2436 static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) 2437 { 2438 } 2439 #define shmem_initxattrs NULL 2440 #endif 2441 2442 static struct offset_ctx *shmem_get_offset_ctx(struct inode *inode) 2443 { 2444 return &SHMEM_I(inode)->dir_offsets; 2445 } 2446 2447 static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, 2448 struct super_block *sb, 2449 struct inode *dir, umode_t mode, 2450 dev_t dev, unsigned long flags) 2451 { 2452 struct inode *inode; 2453 struct shmem_inode_info *info; 2454 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 2455 ino_t ino; 2456 int err; 2457 2458 err = shmem_reserve_inode(sb, &ino); 2459 if (err) 2460 return ERR_PTR(err); 2461 2462 inode = new_inode(sb); 2463 if (!inode) { 2464 shmem_free_inode(sb, 0); 2465 return ERR_PTR(-ENOSPC); 2466 } 2467 2468 inode->i_ino = ino; 2469 inode_init_owner(idmap, inode, dir, mode); 2470 inode->i_blocks = 0; 2471 simple_inode_init_ts(inode); 2472 inode->i_generation = get_random_u32(); 2473 info = SHMEM_I(inode); 2474 memset(info, 0, (char *)inode - (char *)info); 2475 spin_lock_init(&info->lock); 2476 atomic_set(&info->stop_eviction, 0); 2477 info->seals = F_SEAL_SEAL; 2478 info->flags = flags & VM_NORESERVE; 2479 info->i_crtime = inode_get_mtime(inode); 2480 info->fsflags = (dir == NULL) ? 
0 : 2481 SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED; 2482 if (info->fsflags) 2483 shmem_set_inode_flags(inode, info->fsflags); 2484 INIT_LIST_HEAD(&info->shrinklist); 2485 INIT_LIST_HEAD(&info->swaplist); 2486 simple_xattrs_init(&info->xattrs); 2487 cache_no_acl(inode); 2488 if (sbinfo->noswap) 2489 mapping_set_unevictable(inode->i_mapping); 2490 mapping_set_large_folios(inode->i_mapping); 2491 2492 switch (mode & S_IFMT) { 2493 default: 2494 inode->i_op = &shmem_special_inode_operations; 2495 init_special_inode(inode, mode, dev); 2496 break; 2497 case S_IFREG: 2498 inode->i_mapping->a_ops = &shmem_aops; 2499 inode->i_op = &shmem_inode_operations; 2500 inode->i_fop = &shmem_file_operations; 2501 mpol_shared_policy_init(&info->policy, 2502 shmem_get_sbmpol(sbinfo)); 2503 break; 2504 case S_IFDIR: 2505 inc_nlink(inode); 2506 /* Some things misbehave if size == 0 on a directory */ 2507 inode->i_size = 2 * BOGO_DIRENT_SIZE; 2508 inode->i_op = &shmem_dir_inode_operations; 2509 inode->i_fop = &simple_offset_dir_operations; 2510 simple_offset_init(shmem_get_offset_ctx(inode)); 2511 break; 2512 case S_IFLNK: 2513 /* 2514 * Must not load anything in the rbtree, 2515 * mpol_free_shared_policy will not be called. 2516 */ 2517 mpol_shared_policy_init(&info->policy, NULL); 2518 break; 2519 } 2520 2521 lockdep_annotate_inode_mutex_key(inode); 2522 return inode; 2523 } 2524 2525 #ifdef CONFIG_TMPFS_QUOTA 2526 static struct inode *shmem_get_inode(struct mnt_idmap *idmap, 2527 struct super_block *sb, struct inode *dir, 2528 umode_t mode, dev_t dev, unsigned long flags) 2529 { 2530 int err; 2531 struct inode *inode; 2532 2533 inode = __shmem_get_inode(idmap, sb, dir, mode, dev, flags); 2534 if (IS_ERR(inode)) 2535 return inode; 2536 2537 err = dquot_initialize(inode); 2538 if (err) 2539 goto errout; 2540 2541 err = dquot_alloc_inode(inode); 2542 if (err) { 2543 dquot_drop(inode); 2544 goto errout; 2545 } 2546 return inode; 2547 2548 errout: 2549 inode->i_flags |= S_NOQUOTA; 2550 iput(inode); 2551 return ERR_PTR(err); 2552 } 2553 #else 2554 static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, 2555 struct super_block *sb, struct inode *dir, 2556 umode_t mode, dev_t dev, unsigned long flags) 2557 { 2558 return __shmem_get_inode(idmap, sb, dir, mode, dev, flags); 2559 } 2560 #endif /* CONFIG_TMPFS_QUOTA */ 2561 2562 #ifdef CONFIG_USERFAULTFD 2563 int shmem_mfill_atomic_pte(pmd_t *dst_pmd, 2564 struct vm_area_struct *dst_vma, 2565 unsigned long dst_addr, 2566 unsigned long src_addr, 2567 uffd_flags_t flags, 2568 struct folio **foliop) 2569 { 2570 struct inode *inode = file_inode(dst_vma->vm_file); 2571 struct shmem_inode_info *info = SHMEM_I(inode); 2572 struct address_space *mapping = inode->i_mapping; 2573 gfp_t gfp = mapping_gfp_mask(mapping); 2574 pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); 2575 void *page_kaddr; 2576 struct folio *folio; 2577 int ret; 2578 pgoff_t max_off; 2579 2580 if (shmem_inode_acct_blocks(inode, 1)) { 2581 /* 2582 * We may have got a page, returned -ENOENT triggering a retry, 2583 * and now we find ourselves with -ENOMEM. Release the page, to 2584 * avoid a BUG_ON in our caller. 
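		 * (*foliop holds the folio allocated on the previous
		 * attempt, when copy_from_user had to be retried outside
		 * mmap_lock; see the fallback below.)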
2585 */ 2586 if (unlikely(*foliop)) { 2587 folio_put(*foliop); 2588 *foliop = NULL; 2589 } 2590 return -ENOMEM; 2591 } 2592 2593 if (!*foliop) { 2594 ret = -ENOMEM; 2595 folio = shmem_alloc_folio(gfp, info, pgoff); 2596 if (!folio) 2597 goto out_unacct_blocks; 2598 2599 if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) { 2600 page_kaddr = kmap_local_folio(folio, 0); 2601 /* 2602 * The read mmap_lock is held here. Despite the 2603 * mmap_lock being read recursive a deadlock is still 2604 * possible if a writer has taken a lock. For example: 2605 * 2606 * process A thread 1 takes read lock on own mmap_lock 2607 * process A thread 2 calls mmap, blocks taking write lock 2608 * process B thread 1 takes page fault, read lock on own mmap lock 2609 * process B thread 2 calls mmap, blocks taking write lock 2610 * process A thread 1 blocks taking read lock on process B 2611 * process B thread 1 blocks taking read lock on process A 2612 * 2613 * Disable page faults to prevent potential deadlock 2614 * and retry the copy outside the mmap_lock. 2615 */ 2616 pagefault_disable(); 2617 ret = copy_from_user(page_kaddr, 2618 (const void __user *)src_addr, 2619 PAGE_SIZE); 2620 pagefault_enable(); 2621 kunmap_local(page_kaddr); 2622 2623 /* fallback to copy_from_user outside mmap_lock */ 2624 if (unlikely(ret)) { 2625 *foliop = folio; 2626 ret = -ENOENT; 2627 /* don't free the page */ 2628 goto out_unacct_blocks; 2629 } 2630 2631 flush_dcache_folio(folio); 2632 } else { /* ZEROPAGE */ 2633 clear_user_highpage(&folio->page, dst_addr); 2634 } 2635 } else { 2636 folio = *foliop; 2637 VM_BUG_ON_FOLIO(folio_test_large(folio), folio); 2638 *foliop = NULL; 2639 } 2640 2641 VM_BUG_ON(folio_test_locked(folio)); 2642 VM_BUG_ON(folio_test_swapbacked(folio)); 2643 __folio_set_locked(folio); 2644 __folio_set_swapbacked(folio); 2645 __folio_mark_uptodate(folio); 2646 2647 ret = -EFAULT; 2648 max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 2649 if (unlikely(pgoff >= max_off)) 2650 goto out_release; 2651 2652 ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp); 2653 if (ret) 2654 goto out_release; 2655 ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp); 2656 if (ret) 2657 goto out_release; 2658 2659 ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, 2660 &folio->page, true, flags); 2661 if (ret) 2662 goto out_delete_from_cache; 2663 2664 shmem_recalc_inode(inode, 1, 0); 2665 folio_unlock(folio); 2666 return 0; 2667 out_delete_from_cache: 2668 filemap_remove_folio(folio); 2669 out_release: 2670 folio_unlock(folio); 2671 folio_put(folio); 2672 out_unacct_blocks: 2673 shmem_inode_unacct_blocks(inode, 1); 2674 return ret; 2675 } 2676 #endif /* CONFIG_USERFAULTFD */ 2677 2678 #ifdef CONFIG_TMPFS 2679 static const struct inode_operations shmem_symlink_inode_operations; 2680 static const struct inode_operations shmem_short_symlink_operations; 2681 2682 static int 2683 shmem_write_begin(struct file *file, struct address_space *mapping, 2684 loff_t pos, unsigned len, 2685 struct page **pagep, void **fsdata) 2686 { 2687 struct inode *inode = mapping->host; 2688 struct shmem_inode_info *info = SHMEM_I(inode); 2689 pgoff_t index = pos >> PAGE_SHIFT; 2690 struct folio *folio; 2691 int ret = 0; 2692 2693 /* i_rwsem is held by caller */ 2694 if (unlikely(info->seals & (F_SEAL_GROW | 2695 F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) { 2696 if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) 2697 return -EPERM; 2698 if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) 2699 return -EPERM; 2700 } 2701 2702 
ret = shmem_get_folio(inode, index, &folio, SGP_WRITE); 2703 if (ret) 2704 return ret; 2705 2706 *pagep = folio_file_page(folio, index); 2707 if (PageHWPoison(*pagep)) { 2708 folio_unlock(folio); 2709 folio_put(folio); 2710 *pagep = NULL; 2711 return -EIO; 2712 } 2713 2714 return 0; 2715 } 2716 2717 static int 2718 shmem_write_end(struct file *file, struct address_space *mapping, 2719 loff_t pos, unsigned len, unsigned copied, 2720 struct page *page, void *fsdata) 2721 { 2722 struct folio *folio = page_folio(page); 2723 struct inode *inode = mapping->host; 2724 2725 if (pos + copied > inode->i_size) 2726 i_size_write(inode, pos + copied); 2727 2728 if (!folio_test_uptodate(folio)) { 2729 if (copied < folio_size(folio)) { 2730 size_t from = offset_in_folio(folio, pos); 2731 folio_zero_segments(folio, 0, from, 2732 from + copied, folio_size(folio)); 2733 } 2734 folio_mark_uptodate(folio); 2735 } 2736 folio_mark_dirty(folio); 2737 folio_unlock(folio); 2738 folio_put(folio); 2739 2740 return copied; 2741 } 2742 2743 static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) 2744 { 2745 struct file *file = iocb->ki_filp; 2746 struct inode *inode = file_inode(file); 2747 struct address_space *mapping = inode->i_mapping; 2748 pgoff_t index; 2749 unsigned long offset; 2750 int error = 0; 2751 ssize_t retval = 0; 2752 loff_t *ppos = &iocb->ki_pos; 2753 2754 index = *ppos >> PAGE_SHIFT; 2755 offset = *ppos & ~PAGE_MASK; 2756 2757 for (;;) { 2758 struct folio *folio = NULL; 2759 struct page *page = NULL; 2760 pgoff_t end_index; 2761 unsigned long nr, ret; 2762 loff_t i_size = i_size_read(inode); 2763 2764 end_index = i_size >> PAGE_SHIFT; 2765 if (index > end_index) 2766 break; 2767 if (index == end_index) { 2768 nr = i_size & ~PAGE_MASK; 2769 if (nr <= offset) 2770 break; 2771 } 2772 2773 error = shmem_get_folio(inode, index, &folio, SGP_READ); 2774 if (error) { 2775 if (error == -EINVAL) 2776 error = 0; 2777 break; 2778 } 2779 if (folio) { 2780 folio_unlock(folio); 2781 2782 page = folio_file_page(folio, index); 2783 if (PageHWPoison(page)) { 2784 folio_put(folio); 2785 error = -EIO; 2786 break; 2787 } 2788 } 2789 2790 /* 2791 * We must evaluate after, since reads (unlike writes) 2792 * are called without i_rwsem protection against truncate 2793 */ 2794 nr = PAGE_SIZE; 2795 i_size = i_size_read(inode); 2796 end_index = i_size >> PAGE_SHIFT; 2797 if (index == end_index) { 2798 nr = i_size & ~PAGE_MASK; 2799 if (nr <= offset) { 2800 if (folio) 2801 folio_put(folio); 2802 break; 2803 } 2804 } 2805 nr -= offset; 2806 2807 if (folio) { 2808 /* 2809 * If users can be writing to this page using arbitrary 2810 * virtual addresses, take care about potential aliasing 2811 * before reading the page on the kernel side. 2812 */ 2813 if (mapping_writably_mapped(mapping)) 2814 flush_dcache_page(page); 2815 /* 2816 * Mark the page accessed if we read the beginning. 2817 */ 2818 if (!offset) 2819 folio_mark_accessed(folio); 2820 /* 2821 * Ok, we have the page, and it's up-to-date, so 2822 * now we can copy it to user space... 2823 */ 2824 ret = copy_page_to_iter(page, offset, nr, to); 2825 folio_put(folio); 2826 2827 } else if (user_backed_iter(to)) { 2828 /* 2829 * Copy to user tends to be so well optimized, but 2830 * clear_user() not so much, that it is noticeably 2831 * faster to copy the zero page instead of clearing. 2832 */ 2833 ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to); 2834 } else { 2835 /* 2836 * But submitting the same page twice in a row to 2837 * splice() - or others? 
- can result in confusion: 2838 * so don't attempt that optimization on pipes etc. 2839 */ 2840 ret = iov_iter_zero(nr, to); 2841 } 2842 2843 retval += ret; 2844 offset += ret; 2845 index += offset >> PAGE_SHIFT; 2846 offset &= ~PAGE_MASK; 2847 2848 if (!iov_iter_count(to)) 2849 break; 2850 if (ret < nr) { 2851 error = -EFAULT; 2852 break; 2853 } 2854 cond_resched(); 2855 } 2856 2857 *ppos = ((loff_t) index << PAGE_SHIFT) + offset; 2858 file_accessed(file); 2859 return retval ? retval : error; 2860 } 2861 2862 static ssize_t shmem_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 2863 { 2864 struct file *file = iocb->ki_filp; 2865 struct inode *inode = file->f_mapping->host; 2866 ssize_t ret; 2867 2868 inode_lock(inode); 2869 ret = generic_write_checks(iocb, from); 2870 if (ret <= 0) 2871 goto unlock; 2872 ret = file_remove_privs(file); 2873 if (ret) 2874 goto unlock; 2875 ret = file_update_time(file); 2876 if (ret) 2877 goto unlock; 2878 ret = generic_perform_write(iocb, from); 2879 unlock: 2880 inode_unlock(inode); 2881 return ret; 2882 } 2883 2884 static bool zero_pipe_buf_get(struct pipe_inode_info *pipe, 2885 struct pipe_buffer *buf) 2886 { 2887 return true; 2888 } 2889 2890 static void zero_pipe_buf_release(struct pipe_inode_info *pipe, 2891 struct pipe_buffer *buf) 2892 { 2893 } 2894 2895 static bool zero_pipe_buf_try_steal(struct pipe_inode_info *pipe, 2896 struct pipe_buffer *buf) 2897 { 2898 return false; 2899 } 2900 2901 static const struct pipe_buf_operations zero_pipe_buf_ops = { 2902 .release = zero_pipe_buf_release, 2903 .try_steal = zero_pipe_buf_try_steal, 2904 .get = zero_pipe_buf_get, 2905 }; 2906 2907 static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe, 2908 loff_t fpos, size_t size) 2909 { 2910 size_t offset = fpos & ~PAGE_MASK; 2911 2912 size = min_t(size_t, size, PAGE_SIZE - offset); 2913 2914 if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { 2915 struct pipe_buffer *buf = pipe_head_buf(pipe); 2916 2917 *buf = (struct pipe_buffer) { 2918 .ops = &zero_pipe_buf_ops, 2919 .page = ZERO_PAGE(0), 2920 .offset = offset, 2921 .len = size, 2922 }; 2923 pipe->head++; 2924 } 2925 2926 return size; 2927 } 2928 2929 static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, 2930 struct pipe_inode_info *pipe, 2931 size_t len, unsigned int flags) 2932 { 2933 struct inode *inode = file_inode(in); 2934 struct address_space *mapping = inode->i_mapping; 2935 struct folio *folio = NULL; 2936 size_t total_spliced = 0, used, npages, n, part; 2937 loff_t isize; 2938 int error = 0; 2939 2940 /* Work out how much data we can actually add into the pipe */ 2941 used = pipe_occupancy(pipe->head, pipe->tail); 2942 npages = max_t(ssize_t, pipe->max_usage - used, 0); 2943 len = min_t(size_t, len, npages * PAGE_SIZE); 2944 2945 do { 2946 if (*ppos >= i_size_read(inode)) 2947 break; 2948 2949 error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio, 2950 SGP_READ); 2951 if (error) { 2952 if (error == -EINVAL) 2953 error = 0; 2954 break; 2955 } 2956 if (folio) { 2957 folio_unlock(folio); 2958 2959 if (folio_test_hwpoison(folio) || 2960 (folio_test_large(folio) && 2961 folio_test_has_hwpoisoned(folio))) { 2962 error = -EIO; 2963 break; 2964 } 2965 } 2966 2967 /* 2968 * i_size must be checked after we know the pages are Uptodate. 
2969 * 2970 * Checking i_size after the check allows us to calculate 2971 * the correct value for "nr", which means the zero-filled 2972 * part of the page is not copied back to userspace (unless 2973 * another truncate extends the file - this is desired though). 2974 */ 2975 isize = i_size_read(inode); 2976 if (unlikely(*ppos >= isize)) 2977 break; 2978 part = min_t(loff_t, isize - *ppos, len); 2979 2980 if (folio) { 2981 /* 2982 * If users can be writing to this page using arbitrary 2983 * virtual addresses, take care about potential aliasing 2984 * before reading the page on the kernel side. 2985 */ 2986 if (mapping_writably_mapped(mapping)) 2987 flush_dcache_folio(folio); 2988 folio_mark_accessed(folio); 2989 /* 2990 * Ok, we have the page, and it's up-to-date, so we can 2991 * now splice it into the pipe. 2992 */ 2993 n = splice_folio_into_pipe(pipe, folio, *ppos, part); 2994 folio_put(folio); 2995 folio = NULL; 2996 } else { 2997 n = splice_zeropage_into_pipe(pipe, *ppos, part); 2998 } 2999 3000 if (!n) 3001 break; 3002 len -= n; 3003 total_spliced += n; 3004 *ppos += n; 3005 in->f_ra.prev_pos = *ppos; 3006 if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) 3007 break; 3008 3009 cond_resched(); 3010 } while (len); 3011 3012 if (folio) 3013 folio_put(folio); 3014 3015 file_accessed(in); 3016 return total_spliced ? total_spliced : error; 3017 } 3018 3019 static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) 3020 { 3021 struct address_space *mapping = file->f_mapping; 3022 struct inode *inode = mapping->host; 3023 3024 if (whence != SEEK_DATA && whence != SEEK_HOLE) 3025 return generic_file_llseek_size(file, offset, whence, 3026 MAX_LFS_FILESIZE, i_size_read(inode)); 3027 if (offset < 0) 3028 return -ENXIO; 3029 3030 inode_lock(inode); 3031 /* We're holding i_rwsem so we can access i_size directly */ 3032 offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence); 3033 if (offset >= 0) 3034 offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); 3035 inode_unlock(inode); 3036 return offset; 3037 } 3038 3039 static long shmem_fallocate(struct file *file, int mode, loff_t offset, 3040 loff_t len) 3041 { 3042 struct inode *inode = file_inode(file); 3043 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 3044 struct shmem_inode_info *info = SHMEM_I(inode); 3045 struct shmem_falloc shmem_falloc; 3046 pgoff_t start, index, end, undo_fallocend; 3047 int error; 3048 3049 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 3050 return -EOPNOTSUPP; 3051 3052 inode_lock(inode); 3053 3054 if (mode & FALLOC_FL_PUNCH_HOLE) { 3055 struct address_space *mapping = file->f_mapping; 3056 loff_t unmap_start = round_up(offset, PAGE_SIZE); 3057 loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; 3058 DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); 3059 3060 /* protected by i_rwsem */ 3061 if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { 3062 error = -EPERM; 3063 goto out; 3064 } 3065 3066 shmem_falloc.waitq = &shmem_falloc_waitq; 3067 shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT; 3068 shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; 3069 spin_lock(&inode->i_lock); 3070 inode->i_private = &shmem_falloc; 3071 spin_unlock(&inode->i_lock); 3072 3073 if ((u64)unmap_end > (u64)unmap_start) 3074 unmap_mapping_range(mapping, unmap_start, 3075 1 + unmap_end - unmap_start, 0); 3076 shmem_truncate_range(inode, offset, offset + len - 1); 3077 /* No need to unmap again: hole-punching leaves COWed pages */ 3078 3079 spin_lock(&inode->i_lock); 
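		/*
		 * Punch is complete: clear i_private and wake any faults
		 * parked in shmem_falloc_wait(), under the same i_lock.
		 */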
3080 inode->i_private = NULL; 3081 wake_up_all(&shmem_falloc_waitq); 3082 WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head)); 3083 spin_unlock(&inode->i_lock); 3084 error = 0; 3085 goto out; 3086 } 3087 3088 /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ 3089 error = inode_newsize_ok(inode, offset + len); 3090 if (error) 3091 goto out; 3092 3093 if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) { 3094 error = -EPERM; 3095 goto out; 3096 } 3097 3098 start = offset >> PAGE_SHIFT; 3099 end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 3100 /* Try to avoid a swapstorm if len is impossible to satisfy */ 3101 if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) { 3102 error = -ENOSPC; 3103 goto out; 3104 } 3105 3106 shmem_falloc.waitq = NULL; 3107 shmem_falloc.start = start; 3108 shmem_falloc.next = start; 3109 shmem_falloc.nr_falloced = 0; 3110 shmem_falloc.nr_unswapped = 0; 3111 spin_lock(&inode->i_lock); 3112 inode->i_private = &shmem_falloc; 3113 spin_unlock(&inode->i_lock); 3114 3115 /* 3116 * info->fallocend is only relevant when huge pages might be 3117 * involved: to prevent split_huge_page() freeing fallocated 3118 * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size. 3119 */ 3120 undo_fallocend = info->fallocend; 3121 if (info->fallocend < end) 3122 info->fallocend = end; 3123 3124 for (index = start; index < end; ) { 3125 struct folio *folio; 3126 3127 /* 3128 * Good, the fallocate(2) manpage permits EINTR: we may have 3129 * been interrupted because we are using up too much memory. 3130 */ 3131 if (signal_pending(current)) 3132 error = -EINTR; 3133 else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) 3134 error = -ENOMEM; 3135 else 3136 error = shmem_get_folio(inode, index, &folio, 3137 SGP_FALLOC); 3138 if (error) { 3139 info->fallocend = undo_fallocend; 3140 /* Remove the !uptodate folios we added */ 3141 if (index > start) { 3142 shmem_undo_range(inode, 3143 (loff_t)start << PAGE_SHIFT, 3144 ((loff_t)index << PAGE_SHIFT) - 1, true); 3145 } 3146 goto undone; 3147 } 3148 3149 /* 3150 * Here is a more important optimization than it appears: 3151 * a second SGP_FALLOC on the same large folio will clear it, 3152 * making it uptodate and un-undoable if we fail later. 3153 */ 3154 index = folio_next_index(folio); 3155 /* Beware 32-bit wraparound */ 3156 if (!index) 3157 index--; 3158 3159 /* 3160 * Inform shmem_writepage() how far we have reached. 3161 * No need for lock or barrier: we have the page lock. 3162 */ 3163 if (!folio_test_uptodate(folio)) 3164 shmem_falloc.nr_falloced += index - shmem_falloc.next; 3165 shmem_falloc.next = index; 3166 3167 /* 3168 * If !uptodate, leave it that way so that freeable folios 3169 * can be recognized if we need to rollback on error later. 3170 * But mark it dirty so that memory pressure will swap rather 3171 * than free the folios we are allocating (and SGP_CACHE folios 3172 * might still be clean: we now need to mark those dirty too). 
3173 */ 3174 folio_mark_dirty(folio); 3175 folio_unlock(folio); 3176 folio_put(folio); 3177 cond_resched(); 3178 } 3179 3180 if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) 3181 i_size_write(inode, offset + len); 3182 undone: 3183 spin_lock(&inode->i_lock); 3184 inode->i_private = NULL; 3185 spin_unlock(&inode->i_lock); 3186 out: 3187 if (!error) 3188 file_modified(file); 3189 inode_unlock(inode); 3190 return error; 3191 } 3192 3193 static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 3194 { 3195 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); 3196 3197 buf->f_type = TMPFS_MAGIC; 3198 buf->f_bsize = PAGE_SIZE; 3199 buf->f_namelen = NAME_MAX; 3200 if (sbinfo->max_blocks) { 3201 buf->f_blocks = sbinfo->max_blocks; 3202 buf->f_bavail = 3203 buf->f_bfree = sbinfo->max_blocks - 3204 percpu_counter_sum(&sbinfo->used_blocks); 3205 } 3206 if (sbinfo->max_inodes) { 3207 buf->f_files = sbinfo->max_inodes; 3208 buf->f_ffree = sbinfo->free_ispace / BOGO_INODE_SIZE; 3209 } 3210 /* else leave those fields 0 like simple_statfs */ 3211 3212 buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b); 3213 3214 return 0; 3215 } 3216 3217 /* 3218 * File creation. Allocate an inode, and we're done.. 3219 */ 3220 static int 3221 shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, 3222 struct dentry *dentry, umode_t mode, dev_t dev) 3223 { 3224 struct inode *inode; 3225 int error; 3226 3227 inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE); 3228 if (IS_ERR(inode)) 3229 return PTR_ERR(inode); 3230 3231 error = simple_acl_create(dir, inode); 3232 if (error) 3233 goto out_iput; 3234 error = security_inode_init_security(inode, dir, &dentry->d_name, 3235 shmem_initxattrs, NULL); 3236 if (error && error != -EOPNOTSUPP) 3237 goto out_iput; 3238 3239 error = simple_offset_add(shmem_get_offset_ctx(dir), dentry); 3240 if (error) 3241 goto out_iput; 3242 3243 dir->i_size += BOGO_DIRENT_SIZE; 3244 inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); 3245 inode_inc_iversion(dir); 3246 d_instantiate(dentry, inode); 3247 dget(dentry); /* Extra count - pin the dentry in core */ 3248 return error; 3249 3250 out_iput: 3251 iput(inode); 3252 return error; 3253 } 3254 3255 static int 3256 shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir, 3257 struct file *file, umode_t mode) 3258 { 3259 struct inode *inode; 3260 int error; 3261 3262 inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE); 3263 if (IS_ERR(inode)) { 3264 error = PTR_ERR(inode); 3265 goto err_out; 3266 } 3267 error = security_inode_init_security(inode, dir, NULL, 3268 shmem_initxattrs, NULL); 3269 if (error && error != -EOPNOTSUPP) 3270 goto out_iput; 3271 error = simple_acl_create(dir, inode); 3272 if (error) 3273 goto out_iput; 3274 d_tmpfile(file, inode); 3275 3276 err_out: 3277 return finish_open_simple(file, error); 3278 out_iput: 3279 iput(inode); 3280 return error; 3281 } 3282 3283 static int shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir, 3284 struct dentry *dentry, umode_t mode) 3285 { 3286 int error; 3287 3288 error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0); 3289 if (error) 3290 return error; 3291 inc_nlink(dir); 3292 return 0; 3293 } 3294 3295 static int shmem_create(struct mnt_idmap *idmap, struct inode *dir, 3296 struct dentry *dentry, umode_t mode, bool excl) 3297 { 3298 return shmem_mknod(idmap, dir, dentry, mode | S_IFREG, 0); 3299 } 3300 3301 /* 3302 * Link a file.. 
3303 */ 3304 static int shmem_link(struct dentry *old_dentry, struct inode *dir, 3305 struct dentry *dentry) 3306 { 3307 struct inode *inode = d_inode(old_dentry); 3308 int ret = 0; 3309 3310 /* 3311 * No ordinary (disk based) filesystem counts links as inodes; 3312 * but each new link needs a new dentry, pinning lowmem, and 3313 * tmpfs dentries cannot be pruned until they are unlinked. 3314 * But if an O_TMPFILE file is linked into the tmpfs, the 3315 * first link must skip that, to get the accounting right. 3316 */ 3317 if (inode->i_nlink) { 3318 ret = shmem_reserve_inode(inode->i_sb, NULL); 3319 if (ret) 3320 goto out; 3321 } 3322 3323 ret = simple_offset_add(shmem_get_offset_ctx(dir), dentry); 3324 if (ret) { 3325 if (inode->i_nlink) 3326 shmem_free_inode(inode->i_sb, 0); 3327 goto out; 3328 } 3329 3330 dir->i_size += BOGO_DIRENT_SIZE; 3331 inode_set_mtime_to_ts(dir, 3332 inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); 3333 inode_inc_iversion(dir); 3334 inc_nlink(inode); 3335 ihold(inode); /* New dentry reference */ 3336 dget(dentry); /* Extra pinning count for the created dentry */ 3337 d_instantiate(dentry, inode); 3338 out: 3339 return ret; 3340 } 3341 3342 static int shmem_unlink(struct inode *dir, struct dentry *dentry) 3343 { 3344 struct inode *inode = d_inode(dentry); 3345 3346 if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) 3347 shmem_free_inode(inode->i_sb, 0); 3348 3349 simple_offset_remove(shmem_get_offset_ctx(dir), dentry); 3350 3351 dir->i_size -= BOGO_DIRENT_SIZE; 3352 inode_set_mtime_to_ts(dir, 3353 inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); 3354 inode_inc_iversion(dir); 3355 drop_nlink(inode); 3356 dput(dentry); /* Undo the count from "create" - does all the work */ 3357 return 0; 3358 } 3359 3360 static int shmem_rmdir(struct inode *dir, struct dentry *dentry) 3361 { 3362 if (!simple_empty(dentry)) 3363 return -ENOTEMPTY; 3364 3365 drop_nlink(d_inode(dentry)); 3366 drop_nlink(dir); 3367 return shmem_unlink(dir, dentry); 3368 } 3369 3370 static int shmem_whiteout(struct mnt_idmap *idmap, 3371 struct inode *old_dir, struct dentry *old_dentry) 3372 { 3373 struct dentry *whiteout; 3374 int error; 3375 3376 whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name); 3377 if (!whiteout) 3378 return -ENOMEM; 3379 3380 error = shmem_mknod(idmap, old_dir, whiteout, 3381 S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); 3382 dput(whiteout); 3383 if (error) 3384 return error; 3385 3386 /* 3387 * Cheat and hash the whiteout while the old dentry is still in 3388 * place, instead of playing games with FS_RENAME_DOES_D_MOVE. 3389 * 3390 * d_lookup() will consistently find one of them at this point, 3391 * not sure which one, but that isn't even important. 3392 */ 3393 d_rehash(whiteout); 3394 return 0; 3395 } 3396 3397 /* 3398 * The VFS layer already does all the dentry stuff for rename, 3399 * we just have to decrement the usage count for the target if 3400 * it exists so that the VFS layer correctly free's it when it 3401 * gets overwritten. 
3402 */ 3403 static int shmem_rename2(struct mnt_idmap *idmap, 3404 struct inode *old_dir, struct dentry *old_dentry, 3405 struct inode *new_dir, struct dentry *new_dentry, 3406 unsigned int flags) 3407 { 3408 struct inode *inode = d_inode(old_dentry); 3409 int they_are_dirs = S_ISDIR(inode->i_mode); 3410 int error; 3411 3412 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) 3413 return -EINVAL; 3414 3415 if (flags & RENAME_EXCHANGE) 3416 return simple_offset_rename_exchange(old_dir, old_dentry, 3417 new_dir, new_dentry); 3418 3419 if (!simple_empty(new_dentry)) 3420 return -ENOTEMPTY; 3421 3422 if (flags & RENAME_WHITEOUT) { 3423 error = shmem_whiteout(idmap, old_dir, old_dentry); 3424 if (error) 3425 return error; 3426 } 3427 3428 simple_offset_remove(shmem_get_offset_ctx(old_dir), old_dentry); 3429 error = simple_offset_add(shmem_get_offset_ctx(new_dir), old_dentry); 3430 if (error) 3431 return error; 3432 3433 if (d_really_is_positive(new_dentry)) { 3434 (void) shmem_unlink(new_dir, new_dentry); 3435 if (they_are_dirs) { 3436 drop_nlink(d_inode(new_dentry)); 3437 drop_nlink(old_dir); 3438 } 3439 } else if (they_are_dirs) { 3440 drop_nlink(old_dir); 3441 inc_nlink(new_dir); 3442 } 3443 3444 old_dir->i_size -= BOGO_DIRENT_SIZE; 3445 new_dir->i_size += BOGO_DIRENT_SIZE; 3446 simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); 3447 inode_inc_iversion(old_dir); 3448 inode_inc_iversion(new_dir); 3449 return 0; 3450 } 3451 3452 static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, 3453 struct dentry *dentry, const char *symname) 3454 { 3455 int error; 3456 int len; 3457 struct inode *inode; 3458 struct folio *folio; 3459 3460 len = strlen(symname) + 1; 3461 if (len > PAGE_SIZE) 3462 return -ENAMETOOLONG; 3463 3464 inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0, 3465 VM_NORESERVE); 3466 if (IS_ERR(inode)) 3467 return PTR_ERR(inode); 3468 3469 error = security_inode_init_security(inode, dir, &dentry->d_name, 3470 shmem_initxattrs, NULL); 3471 if (error && error != -EOPNOTSUPP) 3472 goto out_iput; 3473 3474 error = simple_offset_add(shmem_get_offset_ctx(dir), dentry); 3475 if (error) 3476 goto out_iput; 3477 3478 inode->i_size = len-1; 3479 if (len <= SHORT_SYMLINK_LEN) { 3480 inode->i_link = kmemdup(symname, len, GFP_KERNEL); 3481 if (!inode->i_link) { 3482 error = -ENOMEM; 3483 goto out_remove_offset; 3484 } 3485 inode->i_op = &shmem_short_symlink_operations; 3486 } else { 3487 inode_nohighmem(inode); 3488 error = shmem_get_folio(inode, 0, &folio, SGP_WRITE); 3489 if (error) 3490 goto out_remove_offset; 3491 inode->i_mapping->a_ops = &shmem_aops; 3492 inode->i_op = &shmem_symlink_inode_operations; 3493 memcpy(folio_address(folio), symname, len); 3494 folio_mark_uptodate(folio); 3495 folio_mark_dirty(folio); 3496 folio_unlock(folio); 3497 folio_put(folio); 3498 } 3499 dir->i_size += BOGO_DIRENT_SIZE; 3500 inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); 3501 inode_inc_iversion(dir); 3502 d_instantiate(dentry, inode); 3503 dget(dentry); 3504 return 0; 3505 3506 out_remove_offset: 3507 simple_offset_remove(shmem_get_offset_ctx(dir), dentry); 3508 out_iput: 3509 iput(inode); 3510 return error; 3511 } 3512 3513 static void shmem_put_link(void *arg) 3514 { 3515 folio_mark_accessed(arg); 3516 folio_put(arg); 3517 } 3518 3519 static const char *shmem_get_link(struct dentry *dentry, struct inode *inode, 3520 struct delayed_call *done) 3521 { 3522 struct folio *folio = NULL; 3523 int error; 3524 3525 if (!dentry) { 3526 folio = 
filemap_get_folio(inode->i_mapping, 0); 3527 if (IS_ERR(folio)) 3528 return ERR_PTR(-ECHILD); 3529 if (PageHWPoison(folio_page(folio, 0)) || 3530 !folio_test_uptodate(folio)) { 3531 folio_put(folio); 3532 return ERR_PTR(-ECHILD); 3533 } 3534 } else { 3535 error = shmem_get_folio(inode, 0, &folio, SGP_READ); 3536 if (error) 3537 return ERR_PTR(error); 3538 if (!folio) 3539 return ERR_PTR(-ECHILD); 3540 if (PageHWPoison(folio_page(folio, 0))) { 3541 folio_unlock(folio); 3542 folio_put(folio); 3543 return ERR_PTR(-ECHILD); 3544 } 3545 folio_unlock(folio); 3546 } 3547 set_delayed_call(done, shmem_put_link, folio); 3548 return folio_address(folio); 3549 } 3550 3551 #ifdef CONFIG_TMPFS_XATTR 3552 3553 static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa) 3554 { 3555 struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); 3556 3557 fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE); 3558 3559 return 0; 3560 } 3561 3562 static int shmem_fileattr_set(struct mnt_idmap *idmap, 3563 struct dentry *dentry, struct fileattr *fa) 3564 { 3565 struct inode *inode = d_inode(dentry); 3566 struct shmem_inode_info *info = SHMEM_I(inode); 3567 3568 if (fileattr_has_fsx(fa)) 3569 return -EOPNOTSUPP; 3570 if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE) 3571 return -EOPNOTSUPP; 3572 3573 info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) | 3574 (fa->flags & SHMEM_FL_USER_MODIFIABLE); 3575 3576 shmem_set_inode_flags(inode, info->fsflags); 3577 inode_set_ctime_current(inode); 3578 inode_inc_iversion(inode); 3579 return 0; 3580 } 3581 3582 /* 3583 * Superblocks without xattr inode operations may get some security.* xattr 3584 * support from the LSM "for free". As soon as we have any other xattrs 3585 * like ACLs, we also need to implement the security.* handlers at 3586 * filesystem level, though. 3587 */ 3588 3589 /* 3590 * Callback for security_inode_init_security() for acquiring xattrs. 
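 *
 * Each xattr supplied by the LSM is charged against sbinfo->free_ispace
 * (when max_inodes is set) and then added to info->xattrs with the
 * "security." prefix prepended to its name.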
3591 */ 3592 static int shmem_initxattrs(struct inode *inode, 3593 const struct xattr *xattr_array, void *fs_info) 3594 { 3595 struct shmem_inode_info *info = SHMEM_I(inode); 3596 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 3597 const struct xattr *xattr; 3598 struct simple_xattr *new_xattr; 3599 size_t ispace = 0; 3600 size_t len; 3601 3602 if (sbinfo->max_inodes) { 3603 for (xattr = xattr_array; xattr->name != NULL; xattr++) { 3604 ispace += simple_xattr_space(xattr->name, 3605 xattr->value_len + XATTR_SECURITY_PREFIX_LEN); 3606 } 3607 if (ispace) { 3608 raw_spin_lock(&sbinfo->stat_lock); 3609 if (sbinfo->free_ispace < ispace) 3610 ispace = 0; 3611 else 3612 sbinfo->free_ispace -= ispace; 3613 raw_spin_unlock(&sbinfo->stat_lock); 3614 if (!ispace) 3615 return -ENOSPC; 3616 } 3617 } 3618 3619 for (xattr = xattr_array; xattr->name != NULL; xattr++) { 3620 new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len); 3621 if (!new_xattr) 3622 break; 3623 3624 len = strlen(xattr->name) + 1; 3625 new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, 3626 GFP_KERNEL_ACCOUNT); 3627 if (!new_xattr->name) { 3628 kvfree(new_xattr); 3629 break; 3630 } 3631 3632 memcpy(new_xattr->name, XATTR_SECURITY_PREFIX, 3633 XATTR_SECURITY_PREFIX_LEN); 3634 memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, 3635 xattr->name, len); 3636 3637 simple_xattr_add(&info->xattrs, new_xattr); 3638 } 3639 3640 if (xattr->name != NULL) { 3641 if (ispace) { 3642 raw_spin_lock(&sbinfo->stat_lock); 3643 sbinfo->free_ispace += ispace; 3644 raw_spin_unlock(&sbinfo->stat_lock); 3645 } 3646 simple_xattrs_free(&info->xattrs, NULL); 3647 return -ENOMEM; 3648 } 3649 3650 return 0; 3651 } 3652 3653 static int shmem_xattr_handler_get(const struct xattr_handler *handler, 3654 struct dentry *unused, struct inode *inode, 3655 const char *name, void *buffer, size_t size) 3656 { 3657 struct shmem_inode_info *info = SHMEM_I(inode); 3658 3659 name = xattr_full_name(handler, name); 3660 return simple_xattr_get(&info->xattrs, name, buffer, size); 3661 } 3662 3663 static int shmem_xattr_handler_set(const struct xattr_handler *handler, 3664 struct mnt_idmap *idmap, 3665 struct dentry *unused, struct inode *inode, 3666 const char *name, const void *value, 3667 size_t size, int flags) 3668 { 3669 struct shmem_inode_info *info = SHMEM_I(inode); 3670 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 3671 struct simple_xattr *old_xattr; 3672 size_t ispace = 0; 3673 3674 name = xattr_full_name(handler, name); 3675 if (value && sbinfo->max_inodes) { 3676 ispace = simple_xattr_space(name, size); 3677 raw_spin_lock(&sbinfo->stat_lock); 3678 if (sbinfo->free_ispace < ispace) 3679 ispace = 0; 3680 else 3681 sbinfo->free_ispace -= ispace; 3682 raw_spin_unlock(&sbinfo->stat_lock); 3683 if (!ispace) 3684 return -ENOSPC; 3685 } 3686 3687 old_xattr = simple_xattr_set(&info->xattrs, name, value, size, flags); 3688 if (!IS_ERR(old_xattr)) { 3689 ispace = 0; 3690 if (old_xattr && sbinfo->max_inodes) 3691 ispace = simple_xattr_space(old_xattr->name, 3692 old_xattr->size); 3693 simple_xattr_free(old_xattr); 3694 old_xattr = NULL; 3695 inode_set_ctime_current(inode); 3696 inode_inc_iversion(inode); 3697 } 3698 if (ispace) { 3699 raw_spin_lock(&sbinfo->stat_lock); 3700 sbinfo->free_ispace += ispace; 3701 raw_spin_unlock(&sbinfo->stat_lock); 3702 } 3703 return PTR_ERR(old_xattr); 3704 } 3705 3706 static const struct xattr_handler shmem_security_xattr_handler = { 3707 .prefix = XATTR_SECURITY_PREFIX, 3708 .get = shmem_xattr_handler_get, 3709 .set 
= shmem_xattr_handler_set, 3710 }; 3711 3712 static const struct xattr_handler shmem_trusted_xattr_handler = { 3713 .prefix = XATTR_TRUSTED_PREFIX, 3714 .get = shmem_xattr_handler_get, 3715 .set = shmem_xattr_handler_set, 3716 }; 3717 3718 static const struct xattr_handler shmem_user_xattr_handler = { 3719 .prefix = XATTR_USER_PREFIX, 3720 .get = shmem_xattr_handler_get, 3721 .set = shmem_xattr_handler_set, 3722 }; 3723 3724 static const struct xattr_handler * const shmem_xattr_handlers[] = { 3725 &shmem_security_xattr_handler, 3726 &shmem_trusted_xattr_handler, 3727 &shmem_user_xattr_handler, 3728 NULL 3729 }; 3730 3731 static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) 3732 { 3733 struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); 3734 return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size); 3735 } 3736 #endif /* CONFIG_TMPFS_XATTR */ 3737 3738 static const struct inode_operations shmem_short_symlink_operations = { 3739 .getattr = shmem_getattr, 3740 .setattr = shmem_setattr, 3741 .get_link = simple_get_link, 3742 #ifdef CONFIG_TMPFS_XATTR 3743 .listxattr = shmem_listxattr, 3744 #endif 3745 }; 3746 3747 static const struct inode_operations shmem_symlink_inode_operations = { 3748 .getattr = shmem_getattr, 3749 .setattr = shmem_setattr, 3750 .get_link = shmem_get_link, 3751 #ifdef CONFIG_TMPFS_XATTR 3752 .listxattr = shmem_listxattr, 3753 #endif 3754 }; 3755 3756 static struct dentry *shmem_get_parent(struct dentry *child) 3757 { 3758 return ERR_PTR(-ESTALE); 3759 } 3760 3761 static int shmem_match(struct inode *ino, void *vfh) 3762 { 3763 __u32 *fh = vfh; 3764 __u64 inum = fh[2]; 3765 inum = (inum << 32) | fh[1]; 3766 return ino->i_ino == inum && fh[0] == ino->i_generation; 3767 } 3768 3769 /* Find any alias of inode, but prefer a hashed alias */ 3770 static struct dentry *shmem_find_alias(struct inode *inode) 3771 { 3772 struct dentry *alias = d_find_alias(inode); 3773 3774 return alias ?: d_find_any_alias(inode); 3775 } 3776 3777 static struct dentry *shmem_fh_to_dentry(struct super_block *sb, 3778 struct fid *fid, int fh_len, int fh_type) 3779 { 3780 struct inode *inode; 3781 struct dentry *dentry = NULL; 3782 u64 inum; 3783 3784 if (fh_len < 3) 3785 return NULL; 3786 3787 inum = fid->raw[2]; 3788 inum = (inum << 32) | fid->raw[1]; 3789 3790 inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), 3791 shmem_match, fid->raw); 3792 if (inode) { 3793 dentry = shmem_find_alias(inode); 3794 iput(inode); 3795 } 3796 3797 return dentry; 3798 } 3799 3800 static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len, 3801 struct inode *parent) 3802 { 3803 if (*len < 3) { 3804 *len = 3; 3805 return FILEID_INVALID; 3806 } 3807 3808 if (inode_unhashed(inode)) { 3809 /* Unfortunately insert_inode_hash is not idempotent, 3810 * so as we hash inodes here rather than at creation 3811 * time, we need a lock to ensure we only try 3812 * to do it once 3813 */ 3814 static DEFINE_SPINLOCK(lock); 3815 spin_lock(&lock); 3816 if (inode_unhashed(inode)) 3817 __insert_inode_hash(inode, 3818 inode->i_ino + inode->i_generation); 3819 spin_unlock(&lock); 3820 } 3821 3822 fh[0] = inode->i_generation; 3823 fh[1] = inode->i_ino; 3824 fh[2] = ((__u64)inode->i_ino) >> 32; 3825 3826 *len = 3; 3827 return 1; 3828 } 3829 3830 static const struct export_operations shmem_export_ops = { 3831 .get_parent = shmem_get_parent, 3832 .encode_fh = shmem_encode_fh, 3833 .fh_to_dentry = shmem_fh_to_dentry, 3834 }; 3835 3836 enum shmem_param { 3837 Opt_gid, 3838 Opt_huge, 
3839 Opt_mode, 3840 Opt_mpol, 3841 Opt_nr_blocks, 3842 Opt_nr_inodes, 3843 Opt_size, 3844 Opt_uid, 3845 Opt_inode32, 3846 Opt_inode64, 3847 Opt_noswap, 3848 Opt_quota, 3849 Opt_usrquota, 3850 Opt_grpquota, 3851 Opt_usrquota_block_hardlimit, 3852 Opt_usrquota_inode_hardlimit, 3853 Opt_grpquota_block_hardlimit, 3854 Opt_grpquota_inode_hardlimit, 3855 }; 3856 3857 static const struct constant_table shmem_param_enums_huge[] = { 3858 {"never", SHMEM_HUGE_NEVER }, 3859 {"always", SHMEM_HUGE_ALWAYS }, 3860 {"within_size", SHMEM_HUGE_WITHIN_SIZE }, 3861 {"advise", SHMEM_HUGE_ADVISE }, 3862 {} 3863 }; 3864 3865 const struct fs_parameter_spec shmem_fs_parameters[] = { 3866 fsparam_u32 ("gid", Opt_gid), 3867 fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge), 3868 fsparam_u32oct("mode", Opt_mode), 3869 fsparam_string("mpol", Opt_mpol), 3870 fsparam_string("nr_blocks", Opt_nr_blocks), 3871 fsparam_string("nr_inodes", Opt_nr_inodes), 3872 fsparam_string("size", Opt_size), 3873 fsparam_u32 ("uid", Opt_uid), 3874 fsparam_flag ("inode32", Opt_inode32), 3875 fsparam_flag ("inode64", Opt_inode64), 3876 fsparam_flag ("noswap", Opt_noswap), 3877 #ifdef CONFIG_TMPFS_QUOTA 3878 fsparam_flag ("quota", Opt_quota), 3879 fsparam_flag ("usrquota", Opt_usrquota), 3880 fsparam_flag ("grpquota", Opt_grpquota), 3881 fsparam_string("usrquota_block_hardlimit", Opt_usrquota_block_hardlimit), 3882 fsparam_string("usrquota_inode_hardlimit", Opt_usrquota_inode_hardlimit), 3883 fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit), 3884 fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit), 3885 #endif 3886 {} 3887 }; 3888 3889 static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) 3890 { 3891 struct shmem_options *ctx = fc->fs_private; 3892 struct fs_parse_result result; 3893 unsigned long long size; 3894 char *rest; 3895 int opt; 3896 kuid_t kuid; 3897 kgid_t kgid; 3898 3899 opt = fs_parse(fc, shmem_fs_parameters, param, &result); 3900 if (opt < 0) 3901 return opt; 3902 3903 switch (opt) { 3904 case Opt_size: 3905 size = memparse(param->string, &rest); 3906 if (*rest == '%') { 3907 size <<= PAGE_SHIFT; 3908 size *= totalram_pages(); 3909 do_div(size, 100); 3910 rest++; 3911 } 3912 if (*rest) 3913 goto bad_value; 3914 ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE); 3915 ctx->seen |= SHMEM_SEEN_BLOCKS; 3916 break; 3917 case Opt_nr_blocks: 3918 ctx->blocks = memparse(param->string, &rest); 3919 if (*rest || ctx->blocks > LONG_MAX) 3920 goto bad_value; 3921 ctx->seen |= SHMEM_SEEN_BLOCKS; 3922 break; 3923 case Opt_nr_inodes: 3924 ctx->inodes = memparse(param->string, &rest); 3925 if (*rest || ctx->inodes > ULONG_MAX / BOGO_INODE_SIZE) 3926 goto bad_value; 3927 ctx->seen |= SHMEM_SEEN_INODES; 3928 break; 3929 case Opt_mode: 3930 ctx->mode = result.uint_32 & 07777; 3931 break; 3932 case Opt_uid: 3933 kuid = make_kuid(current_user_ns(), result.uint_32); 3934 if (!uid_valid(kuid)) 3935 goto bad_value; 3936 3937 /* 3938 * The requested uid must be representable in the 3939 * filesystem's idmapping. 3940 */ 3941 if (!kuid_has_mapping(fc->user_ns, kuid)) 3942 goto bad_value; 3943 3944 ctx->uid = kuid; 3945 break; 3946 case Opt_gid: 3947 kgid = make_kgid(current_user_ns(), result.uint_32); 3948 if (!gid_valid(kgid)) 3949 goto bad_value; 3950 3951 /* 3952 * The requested gid must be representable in the 3953 * filesystem's idmapping. 
3954 */ 3955 if (!kgid_has_mapping(fc->user_ns, kgid)) 3956 goto bad_value; 3957 3958 ctx->gid = kgid; 3959 break; 3960 case Opt_huge: 3961 ctx->huge = result.uint_32; 3962 if (ctx->huge != SHMEM_HUGE_NEVER && 3963 !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && 3964 has_transparent_hugepage())) 3965 goto unsupported_parameter; 3966 ctx->seen |= SHMEM_SEEN_HUGE; 3967 break; 3968 case Opt_mpol: 3969 if (IS_ENABLED(CONFIG_NUMA)) { 3970 mpol_put(ctx->mpol); 3971 ctx->mpol = NULL; 3972 if (mpol_parse_str(param->string, &ctx->mpol)) 3973 goto bad_value; 3974 break; 3975 } 3976 goto unsupported_parameter; 3977 case Opt_inode32: 3978 ctx->full_inums = false; 3979 ctx->seen |= SHMEM_SEEN_INUMS; 3980 break; 3981 case Opt_inode64: 3982 if (sizeof(ino_t) < 8) { 3983 return invalfc(fc, 3984 "Cannot use inode64 with <64bit inums in kernel\n"); 3985 } 3986 ctx->full_inums = true; 3987 ctx->seen |= SHMEM_SEEN_INUMS; 3988 break; 3989 case Opt_noswap: 3990 if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) { 3991 return invalfc(fc, 3992 "Turning off swap in unprivileged tmpfs mounts unsupported"); 3993 } 3994 ctx->noswap = true; 3995 ctx->seen |= SHMEM_SEEN_NOSWAP; 3996 break; 3997 case Opt_quota: 3998 if (fc->user_ns != &init_user_ns) 3999 return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); 4000 ctx->seen |= SHMEM_SEEN_QUOTA; 4001 ctx->quota_types |= (QTYPE_MASK_USR | QTYPE_MASK_GRP); 4002 break; 4003 case Opt_usrquota: 4004 if (fc->user_ns != &init_user_ns) 4005 return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); 4006 ctx->seen |= SHMEM_SEEN_QUOTA; 4007 ctx->quota_types |= QTYPE_MASK_USR; 4008 break; 4009 case Opt_grpquota: 4010 if (fc->user_ns != &init_user_ns) 4011 return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); 4012 ctx->seen |= SHMEM_SEEN_QUOTA; 4013 ctx->quota_types |= QTYPE_MASK_GRP; 4014 break; 4015 case Opt_usrquota_block_hardlimit: 4016 size = memparse(param->string, &rest); 4017 if (*rest || !size) 4018 goto bad_value; 4019 if (size > SHMEM_QUOTA_MAX_SPC_LIMIT) 4020 return invalfc(fc, 4021 "User quota block hardlimit too large."); 4022 ctx->qlimits.usrquota_bhardlimit = size; 4023 break; 4024 case Opt_grpquota_block_hardlimit: 4025 size = memparse(param->string, &rest); 4026 if (*rest || !size) 4027 goto bad_value; 4028 if (size > SHMEM_QUOTA_MAX_SPC_LIMIT) 4029 return invalfc(fc, 4030 "Group quota block hardlimit too large."); 4031 ctx->qlimits.grpquota_bhardlimit = size; 4032 break; 4033 case Opt_usrquota_inode_hardlimit: 4034 size = memparse(param->string, &rest); 4035 if (*rest || !size) 4036 goto bad_value; 4037 if (size > SHMEM_QUOTA_MAX_INO_LIMIT) 4038 return invalfc(fc, 4039 "User quota inode hardlimit too large."); 4040 ctx->qlimits.usrquota_ihardlimit = size; 4041 break; 4042 case Opt_grpquota_inode_hardlimit: 4043 size = memparse(param->string, &rest); 4044 if (*rest || !size) 4045 goto bad_value; 4046 if (size > SHMEM_QUOTA_MAX_INO_LIMIT) 4047 return invalfc(fc, 4048 "Group quota inode hardlimit too large."); 4049 ctx->qlimits.grpquota_ihardlimit = size; 4050 break; 4051 } 4052 return 0; 4053 4054 unsupported_parameter: 4055 return invalfc(fc, "Unsupported parameter '%s'", param->key); 4056 bad_value: 4057 return invalfc(fc, "Bad value for '%s'", param->key); 4058 } 4059 4060 static int shmem_parse_options(struct fs_context *fc, void *data) 4061 { 4062 char *options = data; 4063 4064 if (options) { 4065 int err = security_sb_eat_lsm_opts(options, &fc->security); 4066 if (err) 4067 return err; 4068 } 
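	/*
	 * Illustrative note (example values only, not taken from this file):
	 * a monolithic option string arriving here might look like
	 *
	 *	"size=50%,nr_inodes=1m,mode=1777,mpol=bind:0-2,4,huge=within_size"
	 *
	 * The loop below splits it on commas before handing each option to
	 * vfs_parse_fs_string(); a comma followed by a digit is assumed to be
	 * part of mpol's nodelist (the ",4" above) and is left in place.
	 */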
4069 4070 while (options != NULL) { 4071 char *this_char = options; 4072 for (;;) { 4073 /* 4074 * NUL-terminate this option: unfortunately, 4075 * mount options form a comma-separated list, 4076 * but mpol's nodelist may also contain commas. 4077 */ 4078 options = strchr(options, ','); 4079 if (options == NULL) 4080 break; 4081 options++; 4082 if (!isdigit(*options)) { 4083 options[-1] = '\0'; 4084 break; 4085 } 4086 } 4087 if (*this_char) { 4088 char *value = strchr(this_char, '='); 4089 size_t len = 0; 4090 int err; 4091 4092 if (value) { 4093 *value++ = '\0'; 4094 len = strlen(value); 4095 } 4096 err = vfs_parse_fs_string(fc, this_char, value, len); 4097 if (err < 0) 4098 return err; 4099 } 4100 } 4101 return 0; 4102 } 4103 4104 /* 4105 * Reconfigure a shmem filesystem. 4106 */ 4107 static int shmem_reconfigure(struct fs_context *fc) 4108 { 4109 struct shmem_options *ctx = fc->fs_private; 4110 struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb); 4111 unsigned long used_isp; 4112 struct mempolicy *mpol = NULL; 4113 const char *err; 4114 4115 raw_spin_lock(&sbinfo->stat_lock); 4116 used_isp = sbinfo->max_inodes * BOGO_INODE_SIZE - sbinfo->free_ispace; 4117 4118 if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { 4119 if (!sbinfo->max_blocks) { 4120 err = "Cannot retroactively limit size"; 4121 goto out; 4122 } 4123 if (percpu_counter_compare(&sbinfo->used_blocks, 4124 ctx->blocks) > 0) { 4125 err = "Too small a size for current use"; 4126 goto out; 4127 } 4128 } 4129 if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) { 4130 if (!sbinfo->max_inodes) { 4131 err = "Cannot retroactively limit inodes"; 4132 goto out; 4133 } 4134 if (ctx->inodes * BOGO_INODE_SIZE < used_isp) { 4135 err = "Too few inodes for current use"; 4136 goto out; 4137 } 4138 } 4139 4140 if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums && 4141 sbinfo->next_ino > UINT_MAX) { 4142 err = "Current inum too high to switch to 32-bit inums"; 4143 goto out; 4144 } 4145 if ((ctx->seen & SHMEM_SEEN_NOSWAP) && ctx->noswap && !sbinfo->noswap) { 4146 err = "Cannot disable swap on remount"; 4147 goto out; 4148 } 4149 if (!(ctx->seen & SHMEM_SEEN_NOSWAP) && !ctx->noswap && sbinfo->noswap) { 4150 err = "Cannot enable swap on remount if it was disabled on first mount"; 4151 goto out; 4152 } 4153 4154 if (ctx->seen & SHMEM_SEEN_QUOTA && 4155 !sb_any_quota_loaded(fc->root->d_sb)) { 4156 err = "Cannot enable quota on remount"; 4157 goto out; 4158 } 4159 4160 #ifdef CONFIG_TMPFS_QUOTA 4161 #define CHANGED_LIMIT(name) \ 4162 (ctx->qlimits.name## hardlimit && \ 4163 (ctx->qlimits.name## hardlimit != sbinfo->qlimits.name## hardlimit)) 4164 4165 if (CHANGED_LIMIT(usrquota_b) || CHANGED_LIMIT(usrquota_i) || 4166 CHANGED_LIMIT(grpquota_b) || CHANGED_LIMIT(grpquota_i)) { 4167 err = "Cannot change global quota limit on remount"; 4168 goto out; 4169 } 4170 #endif /* CONFIG_TMPFS_QUOTA */ 4171 4172 if (ctx->seen & SHMEM_SEEN_HUGE) 4173 sbinfo->huge = ctx->huge; 4174 if (ctx->seen & SHMEM_SEEN_INUMS) 4175 sbinfo->full_inums = ctx->full_inums; 4176 if (ctx->seen & SHMEM_SEEN_BLOCKS) 4177 sbinfo->max_blocks = ctx->blocks; 4178 if (ctx->seen & SHMEM_SEEN_INODES) { 4179 sbinfo->max_inodes = ctx->inodes; 4180 sbinfo->free_ispace = ctx->inodes * BOGO_INODE_SIZE - used_isp; 4181 } 4182 4183 /* 4184 * Preserve previous mempolicy unless mpol remount option was specified. 
4185 */ 4186 if (ctx->mpol) { 4187 mpol = sbinfo->mpol; 4188 sbinfo->mpol = ctx->mpol; /* transfers initial ref */ 4189 ctx->mpol = NULL; 4190 } 4191 4192 if (ctx->noswap) 4193 sbinfo->noswap = true; 4194 4195 raw_spin_unlock(&sbinfo->stat_lock); 4196 mpol_put(mpol); 4197 return 0; 4198 out: 4199 raw_spin_unlock(&sbinfo->stat_lock); 4200 return invalfc(fc, "%s", err); 4201 } 4202 4203 static int shmem_show_options(struct seq_file *seq, struct dentry *root) 4204 { 4205 struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb); 4206 struct mempolicy *mpol; 4207 4208 if (sbinfo->max_blocks != shmem_default_max_blocks()) 4209 seq_printf(seq, ",size=%luk", K(sbinfo->max_blocks)); 4210 if (sbinfo->max_inodes != shmem_default_max_inodes()) 4211 seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); 4212 if (sbinfo->mode != (0777 | S_ISVTX)) 4213 seq_printf(seq, ",mode=%03ho", sbinfo->mode); 4214 if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID)) 4215 seq_printf(seq, ",uid=%u", 4216 from_kuid_munged(&init_user_ns, sbinfo->uid)); 4217 if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) 4218 seq_printf(seq, ",gid=%u", 4219 from_kgid_munged(&init_user_ns, sbinfo->gid)); 4220 4221 /* 4222 * Showing inode{64,32} might be useful even if it's the system default, 4223 * since then people don't have to resort to checking both here and 4224 * /proc/config.gz to confirm 64-bit inums were successfully applied 4225 * (which may not even exist if IKCONFIG_PROC isn't enabled). 4226 * 4227 * We hide it when inode64 isn't the default and we are using 32-bit 4228 * inodes, since that probably just means the feature isn't even under 4229 * consideration. 4230 * 4231 * As such: 4232 * 4233 * +-----------------+-----------------+ 4234 * | TMPFS_INODE64=y | TMPFS_INODE64=n | 4235 * +------------------+-----------------+-----------------+ 4236 * | full_inums=true | show | show | 4237 * | full_inums=false | show | hide | 4238 * +------------------+-----------------+-----------------+ 4239 * 4240 */ 4241 if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums) 4242 seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 
64 : 32)); 4243 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4244 /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */ 4245 if (sbinfo->huge) 4246 seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge)); 4247 #endif 4248 mpol = shmem_get_sbmpol(sbinfo); 4249 shmem_show_mpol(seq, mpol); 4250 mpol_put(mpol); 4251 if (sbinfo->noswap) 4252 seq_printf(seq, ",noswap"); 4253 return 0; 4254 } 4255 4256 #endif /* CONFIG_TMPFS */ 4257 4258 static void shmem_put_super(struct super_block *sb) 4259 { 4260 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 4261 4262 #ifdef CONFIG_TMPFS_QUOTA 4263 shmem_disable_quotas(sb); 4264 #endif 4265 free_percpu(sbinfo->ino_batch); 4266 percpu_counter_destroy(&sbinfo->used_blocks); 4267 mpol_put(sbinfo->mpol); 4268 kfree(sbinfo); 4269 sb->s_fs_info = NULL; 4270 } 4271 4272 static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) 4273 { 4274 struct shmem_options *ctx = fc->fs_private; 4275 struct inode *inode; 4276 struct shmem_sb_info *sbinfo; 4277 int error = -ENOMEM; 4278 4279 /* Round up to L1_CACHE_BYTES to resist false sharing */ 4280 sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), 4281 L1_CACHE_BYTES), GFP_KERNEL); 4282 if (!sbinfo) 4283 return error; 4284 4285 sb->s_fs_info = sbinfo; 4286 4287 #ifdef CONFIG_TMPFS 4288 /* 4289 * Per default we only allow half of the physical ram per 4290 * tmpfs instance, limiting inodes to one per page of lowmem; 4291 * but the internal instance is left unlimited. 4292 */ 4293 if (!(sb->s_flags & SB_KERNMOUNT)) { 4294 if (!(ctx->seen & SHMEM_SEEN_BLOCKS)) 4295 ctx->blocks = shmem_default_max_blocks(); 4296 if (!(ctx->seen & SHMEM_SEEN_INODES)) 4297 ctx->inodes = shmem_default_max_inodes(); 4298 if (!(ctx->seen & SHMEM_SEEN_INUMS)) 4299 ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64); 4300 sbinfo->noswap = ctx->noswap; 4301 } else { 4302 sb->s_flags |= SB_NOUSER; 4303 } 4304 sb->s_export_op = &shmem_export_ops; 4305 sb->s_flags |= SB_NOSEC | SB_I_VERSION; 4306 #else 4307 sb->s_flags |= SB_NOUSER; 4308 #endif 4309 sbinfo->max_blocks = ctx->blocks; 4310 sbinfo->max_inodes = ctx->inodes; 4311 sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE; 4312 if (sb->s_flags & SB_KERNMOUNT) { 4313 sbinfo->ino_batch = alloc_percpu(ino_t); 4314 if (!sbinfo->ino_batch) 4315 goto failed; 4316 } 4317 sbinfo->uid = ctx->uid; 4318 sbinfo->gid = ctx->gid; 4319 sbinfo->full_inums = ctx->full_inums; 4320 sbinfo->mode = ctx->mode; 4321 sbinfo->huge = ctx->huge; 4322 sbinfo->mpol = ctx->mpol; 4323 ctx->mpol = NULL; 4324 4325 raw_spin_lock_init(&sbinfo->stat_lock); 4326 if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) 4327 goto failed; 4328 spin_lock_init(&sbinfo->shrinklist_lock); 4329 INIT_LIST_HEAD(&sbinfo->shrinklist); 4330 4331 sb->s_maxbytes = MAX_LFS_FILESIZE; 4332 sb->s_blocksize = PAGE_SIZE; 4333 sb->s_blocksize_bits = PAGE_SHIFT; 4334 sb->s_magic = TMPFS_MAGIC; 4335 sb->s_op = &shmem_ops; 4336 sb->s_time_gran = 1; 4337 #ifdef CONFIG_TMPFS_XATTR 4338 sb->s_xattr = shmem_xattr_handlers; 4339 #endif 4340 #ifdef CONFIG_TMPFS_POSIX_ACL 4341 sb->s_flags |= SB_POSIXACL; 4342 #endif 4343 uuid_gen(&sb->s_uuid); 4344 4345 #ifdef CONFIG_TMPFS_QUOTA 4346 if (ctx->seen & SHMEM_SEEN_QUOTA) { 4347 sb->dq_op = &shmem_quota_operations; 4348 sb->s_qcop = &dquot_quotactl_sysfile_ops; 4349 sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; 4350 4351 /* Copy the default limits from ctx into sbinfo */ 4352 memcpy(&sbinfo->qlimits, &ctx->qlimits, 4353 sizeof(struct shmem_quota_limits)); 4354 4355 if 
(shmem_enable_quotas(sb, ctx->quota_types)) 4356 goto failed; 4357 } 4358 #endif /* CONFIG_TMPFS_QUOTA */ 4359 4360 inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, 4361 S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); 4362 if (IS_ERR(inode)) { 4363 error = PTR_ERR(inode); 4364 goto failed; 4365 } 4366 inode->i_uid = sbinfo->uid; 4367 inode->i_gid = sbinfo->gid; 4368 sb->s_root = d_make_root(inode); 4369 if (!sb->s_root) 4370 goto failed; 4371 return 0; 4372 4373 failed: 4374 shmem_put_super(sb); 4375 return error; 4376 } 4377 4378 static int shmem_get_tree(struct fs_context *fc) 4379 { 4380 return get_tree_nodev(fc, shmem_fill_super); 4381 } 4382 4383 static void shmem_free_fc(struct fs_context *fc) 4384 { 4385 struct shmem_options *ctx = fc->fs_private; 4386 4387 if (ctx) { 4388 mpol_put(ctx->mpol); 4389 kfree(ctx); 4390 } 4391 } 4392 4393 static const struct fs_context_operations shmem_fs_context_ops = { 4394 .free = shmem_free_fc, 4395 .get_tree = shmem_get_tree, 4396 #ifdef CONFIG_TMPFS 4397 .parse_monolithic = shmem_parse_options, 4398 .parse_param = shmem_parse_one, 4399 .reconfigure = shmem_reconfigure, 4400 #endif 4401 }; 4402 4403 static struct kmem_cache *shmem_inode_cachep __ro_after_init; 4404 4405 static struct inode *shmem_alloc_inode(struct super_block *sb) 4406 { 4407 struct shmem_inode_info *info; 4408 info = alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL); 4409 if (!info) 4410 return NULL; 4411 return &info->vfs_inode; 4412 } 4413 4414 static void shmem_free_in_core_inode(struct inode *inode) 4415 { 4416 if (S_ISLNK(inode->i_mode)) 4417 kfree(inode->i_link); 4418 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 4419 } 4420 4421 static void shmem_destroy_inode(struct inode *inode) 4422 { 4423 if (S_ISREG(inode->i_mode)) 4424 mpol_free_shared_policy(&SHMEM_I(inode)->policy); 4425 if (S_ISDIR(inode->i_mode)) 4426 simple_offset_destroy(shmem_get_offset_ctx(inode)); 4427 } 4428 4429 static void shmem_init_inode(void *foo) 4430 { 4431 struct shmem_inode_info *info = foo; 4432 inode_init_once(&info->vfs_inode); 4433 } 4434 4435 static void __init shmem_init_inodecache(void) 4436 { 4437 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", 4438 sizeof(struct shmem_inode_info), 4439 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode); 4440 } 4441 4442 static void __init shmem_destroy_inodecache(void) 4443 { 4444 kmem_cache_destroy(shmem_inode_cachep); 4445 } 4446 4447 /* Keep the page in page cache instead of truncating it */ 4448 static int shmem_error_remove_page(struct address_space *mapping, 4449 struct page *page) 4450 { 4451 return 0; 4452 } 4453 4454 const struct address_space_operations shmem_aops = { 4455 .writepage = shmem_writepage, 4456 .dirty_folio = noop_dirty_folio, 4457 #ifdef CONFIG_TMPFS 4458 .write_begin = shmem_write_begin, 4459 .write_end = shmem_write_end, 4460 #endif 4461 #ifdef CONFIG_MIGRATION 4462 .migrate_folio = migrate_folio, 4463 #endif 4464 .error_remove_page = shmem_error_remove_page, 4465 }; 4466 EXPORT_SYMBOL(shmem_aops); 4467 4468 static const struct file_operations shmem_file_operations = { 4469 .mmap = shmem_mmap, 4470 .open = shmem_file_open, 4471 .get_unmapped_area = shmem_get_unmapped_area, 4472 #ifdef CONFIG_TMPFS 4473 .llseek = shmem_file_llseek, 4474 .read_iter = shmem_file_read_iter, 4475 .write_iter = shmem_file_write_iter, 4476 .fsync = noop_fsync, 4477 .splice_read = shmem_file_splice_read, 4478 .splice_write = iter_file_splice_write, 4479 .fallocate = shmem_fallocate, 4480 #endif 4481 }; 4482 4483 static const struct 
inode_operations shmem_inode_operations = { 4484 .getattr = shmem_getattr, 4485 .setattr = shmem_setattr, 4486 #ifdef CONFIG_TMPFS_XATTR 4487 .listxattr = shmem_listxattr, 4488 .set_acl = simple_set_acl, 4489 .fileattr_get = shmem_fileattr_get, 4490 .fileattr_set = shmem_fileattr_set, 4491 #endif 4492 }; 4493 4494 static const struct inode_operations shmem_dir_inode_operations = { 4495 #ifdef CONFIG_TMPFS 4496 .getattr = shmem_getattr, 4497 .create = shmem_create, 4498 .lookup = simple_lookup, 4499 .link = shmem_link, 4500 .unlink = shmem_unlink, 4501 .symlink = shmem_symlink, 4502 .mkdir = shmem_mkdir, 4503 .rmdir = shmem_rmdir, 4504 .mknod = shmem_mknod, 4505 .rename = shmem_rename2, 4506 .tmpfile = shmem_tmpfile, 4507 .get_offset_ctx = shmem_get_offset_ctx, 4508 #endif 4509 #ifdef CONFIG_TMPFS_XATTR 4510 .listxattr = shmem_listxattr, 4511 .fileattr_get = shmem_fileattr_get, 4512 .fileattr_set = shmem_fileattr_set, 4513 #endif 4514 #ifdef CONFIG_TMPFS_POSIX_ACL 4515 .setattr = shmem_setattr, 4516 .set_acl = simple_set_acl, 4517 #endif 4518 }; 4519 4520 static const struct inode_operations shmem_special_inode_operations = { 4521 .getattr = shmem_getattr, 4522 #ifdef CONFIG_TMPFS_XATTR 4523 .listxattr = shmem_listxattr, 4524 #endif 4525 #ifdef CONFIG_TMPFS_POSIX_ACL 4526 .setattr = shmem_setattr, 4527 .set_acl = simple_set_acl, 4528 #endif 4529 }; 4530 4531 static const struct super_operations shmem_ops = { 4532 .alloc_inode = shmem_alloc_inode, 4533 .free_inode = shmem_free_in_core_inode, 4534 .destroy_inode = shmem_destroy_inode, 4535 #ifdef CONFIG_TMPFS 4536 .statfs = shmem_statfs, 4537 .show_options = shmem_show_options, 4538 #endif 4539 #ifdef CONFIG_TMPFS_QUOTA 4540 .get_dquots = shmem_get_dquots, 4541 #endif 4542 .evict_inode = shmem_evict_inode, 4543 .drop_inode = generic_delete_inode, 4544 .put_super = shmem_put_super, 4545 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4546 .nr_cached_objects = shmem_unused_huge_count, 4547 .free_cached_objects = shmem_unused_huge_scan, 4548 #endif 4549 }; 4550 4551 static const struct vm_operations_struct shmem_vm_ops = { 4552 .fault = shmem_fault, 4553 .map_pages = filemap_map_pages, 4554 #ifdef CONFIG_NUMA 4555 .set_policy = shmem_set_policy, 4556 .get_policy = shmem_get_policy, 4557 #endif 4558 }; 4559 4560 static const struct vm_operations_struct shmem_anon_vm_ops = { 4561 .fault = shmem_fault, 4562 .map_pages = filemap_map_pages, 4563 #ifdef CONFIG_NUMA 4564 .set_policy = shmem_set_policy, 4565 .get_policy = shmem_get_policy, 4566 #endif 4567 }; 4568 4569 int shmem_init_fs_context(struct fs_context *fc) 4570 { 4571 struct shmem_options *ctx; 4572 4573 ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL); 4574 if (!ctx) 4575 return -ENOMEM; 4576 4577 ctx->mode = 0777 | S_ISVTX; 4578 ctx->uid = current_fsuid(); 4579 ctx->gid = current_fsgid(); 4580 4581 fc->fs_private = ctx; 4582 fc->ops = &shmem_fs_context_ops; 4583 return 0; 4584 } 4585 4586 static struct file_system_type shmem_fs_type = { 4587 .owner = THIS_MODULE, 4588 .name = "tmpfs", 4589 .init_fs_context = shmem_init_fs_context, 4590 #ifdef CONFIG_TMPFS 4591 .parameters = shmem_fs_parameters, 4592 #endif 4593 .kill_sb = kill_litter_super, 4594 .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP, 4595 }; 4596 4597 void __init shmem_init(void) 4598 { 4599 int error; 4600 4601 shmem_init_inodecache(); 4602 4603 #ifdef CONFIG_TMPFS_QUOTA 4604 error = register_quota_format(&shmem_quota_format); 4605 if (error < 0) { 4606 pr_err("Could not register quota format\n"); 4607 goto out3; 4608 } 4609 #endif 4610 
	error = register_filesystem(&shmem_fs_type);
	if (error) {
		pr_err("Could not register tmpfs\n");
		goto out2;
	}

	shm_mnt = kern_mount(&shmem_fs_type);
	if (IS_ERR(shm_mnt)) {
		error = PTR_ERR(shm_mnt);
		pr_err("Could not kern_mount tmpfs\n");
		goto out1;
	}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
	else
		shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */
#endif
	return;

out1:
	unregister_filesystem(&shmem_fs_type);
out2:
#ifdef CONFIG_TMPFS_QUOTA
	unregister_quota_format(&shmem_quota_format);
out3:
#endif
	shmem_destroy_inodecache();
	shm_mnt = ERR_PTR(error);
}

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
static ssize_t shmem_enabled_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	static const int values[] = {
		SHMEM_HUGE_ALWAYS,
		SHMEM_HUGE_WITHIN_SIZE,
		SHMEM_HUGE_ADVISE,
		SHMEM_HUGE_NEVER,
		SHMEM_HUGE_DENY,
		SHMEM_HUGE_FORCE,
	};
	int len = 0;
	int i;

	for (i = 0; i < ARRAY_SIZE(values); i++) {
		len += sysfs_emit_at(buf, len,
				shmem_huge == values[i] ? "%s[%s]" : "%s%s",
				i ? " " : "", shmem_format_huge(values[i]));
	}
	len += sysfs_emit_at(buf, len, "\n");

	return len;
}

static ssize_t shmem_enabled_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	char tmp[16];
	int huge;

	if (count + 1 > sizeof(tmp))
		return -EINVAL;
	memcpy(tmp, buf, count);
	tmp[count] = '\0';
	if (count && tmp[count - 1] == '\n')
		tmp[count - 1] = '\0';

	huge = shmem_parse_huge(tmp);
	if (huge == -EINVAL)
		return -EINVAL;
	if (!has_transparent_hugepage() &&
			huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
		return -EINVAL;

	shmem_huge = huge;
	if (shmem_huge > SHMEM_HUGE_DENY)
		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
	return count;
}

struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */

#else /* !CONFIG_SHMEM */

/*
 * tiny-shmem: simple shmemfs and tmpfs using ramfs code
 *
 * This is intended for small systems where the benefits of the full
 * shmem code (swap-backed and resource-limited) are outweighed by
 * its complexity. On systems without swap this code should be
 * effectively equivalent, but much lighter weight.
 */

static struct file_system_type shmem_fs_type = {
	.name		= "tmpfs",
	.init_fs_context = ramfs_init_fs_context,
	.parameters	= ramfs_fs_parameters,
	.kill_sb	= ramfs_kill_sb,
	.fs_flags	= FS_USERNS_MOUNT,
};

void __init shmem_init(void)
{
	BUG_ON(register_filesystem(&shmem_fs_type) != 0);

	shm_mnt = kern_mount(&shmem_fs_type);
	BUG_ON(IS_ERR(shm_mnt));
}

int shmem_unuse(unsigned int type)
{
	return 0;
}

int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
{
	return 0;
}

void shmem_unlock_mapping(struct address_space *mapping)
{
}

#ifdef CONFIG_MMU
unsigned long shmem_get_unmapped_area(struct file *file,
				      unsigned long addr, unsigned long len,
				      unsigned long pgoff, unsigned long flags)
{
	return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
}
#endif

void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
	truncate_inode_pages_range(inode->i_mapping, lstart, lend);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);

#define shmem_vm_ops				generic_file_vm_ops
#define shmem_anon_vm_ops			generic_file_vm_ops
#define shmem_file_operations			ramfs_file_operations
#define shmem_acct_size(flags, size)		0
#define shmem_unacct_size(flags, size)		do {} while (0)

static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
				struct super_block *sb, struct inode *dir,
				umode_t mode, dev_t dev, unsigned long flags)
{
	struct inode *inode = ramfs_get_inode(sb, dir, mode, dev);
	return inode ? inode : ERR_PTR(-ENOSPC);
}

#endif /* CONFIG_SHMEM */

/* common code */

static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
			loff_t size, unsigned long flags, unsigned int i_flags)
{
	struct inode *inode;
	struct file *res;

	if (IS_ERR(mnt))
		return ERR_CAST(mnt);

	if (size < 0 || size > MAX_LFS_FILESIZE)
		return ERR_PTR(-EINVAL);

	if (shmem_acct_size(flags, size))
		return ERR_PTR(-ENOMEM);

	if (is_idmapped_mnt(mnt))
		return ERR_PTR(-EINVAL);

	inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
				S_IFREG | S_IRWXUGO, 0, flags);
	if (IS_ERR(inode)) {
		shmem_unacct_size(flags, size);
		return ERR_CAST(inode);
	}
	inode->i_flags |= i_flags;
	inode->i_size = size;
	clear_nlink(inode);	/* It is unlinked */
	res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
	if (!IS_ERR(res))
		res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
				&shmem_file_operations);
	if (IS_ERR(res))
		iput(inode);
	return res;
}

/**
 * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
 *	kernel internal. There will be NO LSM permission checks against the
 *	underlying inode. So users of this interface must do LSM checks at a
 *	higher layer. The users are the big_key and shm implementations. LSM
 *	checks are provided at the key or shm level rather than the inode.
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 */
struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
{
	return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
}

/**
 * shmem_file_setup - get an unlinked file living in tmpfs
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 */
struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
{
	return __shmem_file_setup(shm_mnt, name, size, flags, 0);
}
EXPORT_SYMBOL_GPL(shmem_file_setup);

/**
 * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
 * @mnt: the tmpfs mount where the file will be created
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 */
struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
				       loff_t size, unsigned long flags)
{
	return __shmem_file_setup(mnt, name, size, flags, 0);
}
EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);

/**
 * shmem_zero_setup - setup a shared anonymous mapping
 * @vma: the vma to be mmapped is prepared by do_mmap
 */
int shmem_zero_setup(struct vm_area_struct *vma)
{
	struct file *file;
	loff_t size = vma->vm_end - vma->vm_start;

	/*
	 * Cloning a new file under mmap_lock leads to a lock ordering conflict
	 * between XFS directory reading and selinux: since this file is only
	 * accessible to the user through its mapping, use S_PRIVATE flag to
	 * bypass file security, in the same way as shmem_kernel_file_setup().
	 */
	file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags);
	if (IS_ERR(file))
		return PTR_ERR(file);

	if (vma->vm_file)
		fput(vma->vm_file);
	vma->vm_file = file;
	vma->vm_ops = &shmem_anon_vm_ops;

	return 0;
}

/**
 * shmem_read_folio_gfp - read into page cache, using specified page allocation flags.
 * @mapping: the folio's address_space
 * @index: the folio index
 * @gfp: the page allocator flags to use if allocating
 *
 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
 * with any new page allocations done using the specified allocation flags.
 * But read_cache_page_gfp() uses the ->read_folio() method: which does not
 * suit tmpfs, since it may have pages in swapcache, and needs to find those
 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
 *
 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
 */
struct folio *shmem_read_folio_gfp(struct address_space *mapping,
				   pgoff_t index, gfp_t gfp)
{
#ifdef CONFIG_SHMEM
	struct inode *inode = mapping->host;
	struct folio *folio;
	int error;

	BUG_ON(!shmem_mapping(mapping));
	error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE,
				    gfp, NULL, NULL);
	if (error)
		return ERR_PTR(error);

	folio_unlock(folio);
	return folio;
#else
	/*
	 * The tiny !SHMEM case uses ramfs without swap
	 */
	return mapping_read_folio_gfp(mapping, index, gfp);
#endif
}
EXPORT_SYMBOL_GPL(shmem_read_folio_gfp);

struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
					 pgoff_t index, gfp_t gfp)
{
	struct folio *folio = shmem_read_folio_gfp(mapping, index, gfp);
	struct page *page;

	if (IS_ERR(folio))
		return &folio->page;

	page = folio_file_page(folio, index);
	if (PageHWPoison(page)) {
		folio_put(folio);
		return ERR_PTR(-EIO);
	}

	return page;
}
EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
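
/*
 * Illustrative sketch (not part of shmem's interfaces, and not built): one
 * way a driver could use shmem_read_mapping_page_gfp(), folding
 * __GFP_NORETRY | __GFP_NOWARN into the mapping's gfp mask as the comment
 * above shmem_read_folio_gfp() describes.  The function name and the idea
 * that @shmem_file came from shmem_file_setup() are hypothetical, shown
 * here purely for explanation.
 */
#if 0	/* example only */
static struct page *example_shmem_read_page(struct file *shmem_file,
					    pgoff_t index)
{
	struct address_space *mapping = shmem_file->f_mapping;
	gfp_t gfp;

	/* Avoid OOMing the machine for an optional allocation. */
	gfp = mapping_gfp_mask(mapping) | __GFP_NORETRY | __GFP_NOWARN;

	/*
	 * Finds the page in page cache or swap, allocating it if necessary;
	 * returns the page with a reference held (put_page() when done),
	 * or an ERR_PTR() on failure.
	 */
	return shmem_read_mapping_page_gfp(mapping, index, gfp);
}
#endif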