// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/fs/file_table.c
 *
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
 */

#include <linux/string.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/eventpoll.h>
#include <linux/rcupdate.h>
#include <linux/mount.h>
#include <linux/capability.h>
#include <linux/cdev.h>
#include <linux/fsnotify.h>
#include <linux/sysctl.h>
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/task_work.h>
#include <linux/swap.h>
#include <linux/kmemleak.h>

#include <linux/atomic.h>

#include "internal.h"

/* sysctl tunables... */
static struct files_stat_struct files_stat = {
	.max_files = NR_FILE
};

/* SLAB cache for file structures */
static struct kmem_cache *filp_cachep __ro_after_init;

/*
 * Count of accounted (i.e. not FMODE_NOACCOUNT) file structures in the
 * system; a percpu_counter so the hot alloc/free paths stay scalable.
 */
static struct percpu_counter nr_files __cacheline_aligned_in_smp;

/* Container for backing file with optional user path */
struct backing_file {
	struct file file;
	struct path user_path;
};

/*
 * Map a file embedded in a struct backing_file back to its container.
 * Only valid for files allocated via alloc_empty_backing_file()
 * (i.e. files with FMODE_BACKING set).
 */
static inline struct backing_file *backing_file(struct file *f)
{
	return container_of(f, struct backing_file, file);
}

/* Return the user-visible path stored alongside a FMODE_BACKING file. */
struct path *backing_file_user_path(struct file *f)
{
	return &backing_file(f)->user_path;
}
EXPORT_SYMBOL_GPL(backing_file_user_path);

/*
 * Final teardown of a file structure: notify the security module, drop
 * it from the nr_files accounting (unless it was allocated without
 * accounting), release the credential reference, and free the memory.
 * FMODE_BACKING files were kzalloc()ed as a struct backing_file and
 * hold a reference on their user_path, so they are released via
 * path_put() + kfree(); all others go back to filp_cachep.
 */
static inline void file_free(struct file *f)
{
	security_file_free(f);
	if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
		percpu_counter_dec(&nr_files);
	put_cred(f->f_cred);
	if (unlikely(f->f_mode & FMODE_BACKING)) {
		path_put(backing_file_user_path(f));
		kfree(backing_file(f));
	} else {
		kmem_cache_free(filp_cachep, f);
	}
}

/*
 * Return the total number of open files in
 * the system (a cheap, possibly slightly stale percpu read)
 */
static long get_nr_files(void)
{
	return percpu_counter_read_positive(&nr_files);
}

/*
 * Return the maximum number of open files in the system
 */
unsigned long get_max_files(void)
{
	return files_stat.max_files;
}
EXPORT_SYMBOL_GPL(get_max_files);

#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)

/*
 * Handle nr_files sysctl
 */
static int proc_nr_files(const struct ctl_table *table, int write, void *buffer,
			 size_t *lenp, loff_t *ppos)
{
	/* Refresh the snapshot before the generic helper copies it out */
	files_stat.nr_files = get_nr_files();
	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}

static struct ctl_table fs_stat_sysctls[] = {
	{
		.procname	= "file-nr",
		.data		= &files_stat,
		.maxlen		= sizeof(files_stat),
		.mode		= 0444,
		.proc_handler	= proc_nr_files,
	},
	{
		.procname	= "file-max",
		.data		= &files_stat.max_files,
		.maxlen		= sizeof(files_stat.max_files),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= SYSCTL_LONG_ZERO,
		.extra2		= SYSCTL_LONG_MAX,
	},
	{
		.procname	= "nr_open",
		.data		= &sysctl_nr_open,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &sysctl_nr_open_min,
		.extra2		= &sysctl_nr_open_max,
	},
};

/* Register fs/file-nr, fs/file-max and fs/nr_open at fs_initcall time. */
static int __init init_fs_stat_sysctls(void)
{
	register_sysctl_init("fs", fs_stat_sysctls);
	if (IS_ENABLED(CONFIG_BINFMT_MISC)) {
		struct ctl_table_header *hdr;

		/* mount point registration is deliberately kept forever */
		hdr = register_sysctl_mount_point("fs/binfmt_misc");
		kmemleak_not_leak(hdr);
	}
	return 0;
}
fs_initcall(init_fs_stat_sysctls);
#endif

/*
 * Initialize the generic parts of a freshly allocated file: take a
 * credential reference, give the security module a chance to veto the
 * allocation, and set up locks, flags and mode. Returns 0 or a negative
 * errno from security_file_alloc() (the cred reference is dropped on
 * failure; freeing the file itself remains the caller's job).
 */
static int init_file(struct file *f, int flags, const struct cred *cred)
{
	int error;

	f->f_cred = get_cred(cred);
	error = security_file_alloc(f);
	if (unlikely(error)) {
		put_cred(f->f_cred);
		return error;
	}

	spin_lock_init(&f->f_lock);
	/*
	 * Note that f_pos_lock is only used for files raising
	 * FMODE_ATOMIC_POS and directories. Other files such as pipes
	 * don't need it and since f_pos_lock is in a union may reuse
	 * the space for other purposes. They are expected to initialize
	 * the respective member when opening the file.
	 */
	mutex_init(&f->f_pos_lock);
	f->f_flags = flags;
	f->f_mode = OPEN_FMODE(flags);
	/* f->f_version: 0 */

	/*
	 * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While
	 * fget-rcu pattern users need to be able to handle spurious
	 * refcount bumps we should reinitialize the reused file first.
	 */
	atomic_long_set(&f->f_count, 1);
	return 0;
}

/* Find an unused file structure and return a pointer to it.
 * Returns an error pointer if some error happened e.g. we over file
 * structures limit, run out of memory or operation is not permitted.
 *
 * Be very careful using this.  You are responsible for
 * getting write access to any mount that you might assign
 * to this filp, if it is opened for write.  If this is not
 * done, you will imbalance in the mount's writer count
 * and a warning at __fput() time.
 */
struct file *alloc_empty_file(int flags, const struct cred *cred)
{
	/* remembers the highest count already warned about; racy but benign */
	static long old_max;
	struct file *f;
	int error;

	/*
	 * Privileged users can go above max_files
	 */
	if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) {
		/*
		 * percpu_counters are inaccurate. Do an expensive check before
		 * we go and fail.
		 */
		if (percpu_counter_sum_positive(&nr_files) >= files_stat.max_files)
			goto over;
	}

	f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
	if (unlikely(!f))
		return ERR_PTR(-ENOMEM);

	error = init_file(f, flags, cred);
	if (unlikely(error)) {
		kmem_cache_free(filp_cachep, f);
		return ERR_PTR(error);
	}

	percpu_counter_inc(&nr_files);

	return f;

over:
	/* Ran out of filps - report that */
	if (get_nr_files() > old_max) {
		pr_info("VFS: file-max limit %lu reached\n", get_max_files());
		old_max = get_nr_files();
	}
	return ERR_PTR(-ENFILE);
}

/*
 * Variant of alloc_empty_file() that doesn't check and modify nr_files.
 *
 * This is only for kernel internal use, and the allocated file must not be
 * installed into file tables or such.
 */
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred)
{
	struct file *f;
	int error;

	f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
	if (unlikely(!f))
		return ERR_PTR(-ENOMEM);

	error = init_file(f, flags, cred);
	if (unlikely(error)) {
		kmem_cache_free(filp_cachep, f);
		return ERR_PTR(error);
	}

	/* mark it so file_free() skips the nr_files accounting */
	f->f_mode |= FMODE_NOACCOUNT;

	return f;
}

/*
 * Variant of alloc_empty_file() that allocates a backing_file container
 * and doesn't check and modify nr_files.
 *
 * This is only for kernel internal use, and the allocated file must not be
 * installed into file tables or such.
 */
struct file *alloc_empty_backing_file(int flags, const struct cred *cred)
{
	struct backing_file *ff;
	int error;

	ff = kzalloc(sizeof(struct backing_file), GFP_KERNEL);
	if (unlikely(!ff))
		return ERR_PTR(-ENOMEM);

	error = init_file(&ff->file, flags, cred);
	if (unlikely(error)) {
		kfree(ff);
		return ERR_PTR(error);
	}

	/* FMODE_BACKING tells file_free() this came from kzalloc() */
	ff->file.f_mode |= FMODE_BACKING | FMODE_NOACCOUNT;
	return &ff->file;
}

/**
 * file_init_path - initialize a 'struct file' based on path
 *
 * @file: the file to set up
 * @path: the (dentry, vfsmount) pair for the new file
 * @fop: the 'struct file_operations' for the new file
 *
 * Copies the path (without taking references - the caller's references
 * are consumed), samples the mapping/superblock error cursors, and
 * derives the FMODE_LSEEK/FMODE_CAN_READ/FMODE_CAN_WRITE capability
 * bits from the operations actually provided in @fop.
 */
static void file_init_path(struct file *file, const struct path *path,
			   const struct file_operations *fop)
{
	file->f_path = *path;
	file->f_inode = path->dentry->d_inode;
	file->f_mapping = path->dentry->d_inode->i_mapping;
	file->f_wb_err = filemap_sample_wb_err(file->f_mapping);
	file->f_sb_err = file_sample_sb_err(file);
	if (fop->llseek)
		file->f_mode |= FMODE_LSEEK;
	if ((file->f_mode & FMODE_READ) &&
	    likely(fop->read || fop->read_iter))
		file->f_mode |= FMODE_CAN_READ;
	if ((file->f_mode & FMODE_WRITE) &&
	    likely(fop->write || fop->write_iter))
		file->f_mode |= FMODE_CAN_WRITE;
	file->f_iocb_flags = iocb_flags(file);
	file->f_mode |= FMODE_OPENED;
	file->f_op = fop;
	/* read-only opens contribute to the inode's read count */
	if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
		i_readcount_inc(path->dentry->d_inode);
}

/**
 * alloc_file - allocate and initialize a 'struct file'
 *
 * @path: the (dentry, vfsmount) pair for the new file
 * @flags: O_... flags with which the new file will be opened
 * @fop: the 'struct file_operations' for the new file
 *
 * Returns the new file or an ERR_PTR() from alloc_empty_file().
 */
static struct file *alloc_file(const struct path *path, int flags,
			       const struct file_operations *fop)
{
	struct file *file;

	file = alloc_empty_file(flags, current_cred());
	if (!IS_ERR(file))
		file_init_path(file, path, fop);
	return file;
}

/*
 * Build a (dentry, mnt) pair for a pseudo file: allocate an anonymous
 * dentry named @name on @mnt's superblock and bind it to @inode.
 * NOTE(review): d_instantiate() consumes the caller's reference on
 * @inode on success - callers compensate with ihold() on later failure.
 */
static inline int alloc_path_pseudo(const char *name, struct inode *inode,
				    struct vfsmount *mnt, struct path *path)
{
	struct qstr this = QSTR_INIT(name, strlen(name));

	path->dentry = d_alloc_pseudo(mnt->mnt_sb, &this);
	if (!path->dentry)
		return -ENOMEM;
	path->mnt = mntget(mnt);
	d_instantiate(path->dentry, inode);
	return 0;
}

struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
			       const char *name, int flags,
			       const struct file_operations *fops)
{
	int ret;
	struct path path;
	struct file *file;

	ret = alloc_path_pseudo(name, inode, mnt, &path);
	if (ret)
		return ERR_PTR(ret);

	file = alloc_file(&path, flags, fops);
	if (IS_ERR(file)) {
		/*
		 * On failure the caller keeps its inode reference; take an
		 * extra one so the path_put() below doesn't drop the ref
		 * that d_instantiate() consumed.
		 */
		ihold(inode);
		path_put(&path);
	}
	return file;
}
EXPORT_SYMBOL(alloc_file_pseudo);

/*
 * Like alloc_file_pseudo(), but the resulting file is excluded from
 * the nr_files accounting (see alloc_empty_file_noaccount()).
 */
struct file *alloc_file_pseudo_noaccount(struct inode *inode,
					 struct vfsmount *mnt, const char *name,
					 int flags,
					 const struct file_operations *fops)
{
	int ret;
	struct path path;
	struct file *file;

	ret = alloc_path_pseudo(name, inode, mnt, &path);
	if (ret)
		return ERR_PTR(ret);

	file = alloc_empty_file_noaccount(flags, current_cred());
	if (IS_ERR(file)) {
		/* same inode-reference compensation as alloc_file_pseudo() */
		ihold(inode);
		path_put(&path);
		return file;
	}
	file_init_path(file, &path, fops);
	return file;
}
EXPORT_SYMBOL_GPL(alloc_file_pseudo_noaccount);

/*
 * Open a new file sharing @base's path (extra references are taken)
 * and address space, but with its own flags and operations.
 */
struct file *alloc_file_clone(struct file *base, int flags,
			      const struct file_operations *fops)
{
	struct file *f;

	f = alloc_file(&base->f_path, flags, fops);
	if (!IS_ERR(f)) {
		path_get(&f->f_path);
		f->f_mapping = base->f_mapping;
	}
	return f;
}

/*
 * the real guts of fput() - releasing the last reference to file
 *
 * The teardown order below is significant: eventpoll_release() must run
 * first, ->release() is called before the cdev/fops references are
 * dropped, and the dentry is put before the mount. Files that never
 * reached FMODE_OPENED skip all of it and are freed directly.
 */
static void __fput(struct file *file)
{
	struct dentry *dentry = file->f_path.dentry;
	struct vfsmount *mnt = file->f_path.mnt;
	struct inode *inode = file->f_inode;
	fmode_t mode = file->f_mode;

	if (unlikely(!(file->f_mode & FMODE_OPENED)))
		goto out;

	might_sleep();

	fsnotify_close(file);
	/*
	 * The function eventpoll_release() should be the first called
	 * in the file cleanup chain.
	 */
	eventpoll_release(file);
	locks_remove_file(file);

	security_file_release(file);
	if (unlikely(file->f_flags & FASYNC)) {
		if (file->f_op->fasync)
			file->f_op->fasync(-1, file, 0);
	}
	if (file->f_op->release)
		file->f_op->release(inode, file);
	if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
		     !(mode & FMODE_PATH))) {
		cdev_put(inode->i_cdev);
	}
	fops_put(file->f_op);
	file_f_owner_release(file);
	put_file_access(file);
	dput(dentry);
	if (unlikely(mode & FMODE_NEED_UNMOUNT))
		dissolve_on_fput(mnt);
	mntput(mnt);
out:
	file_free(file);
}

/* lock-free list of files whose final __fput() was deferred */
static LLIST_HEAD(delayed_fput_list);

/* Workqueue callback: drain delayed_fput_list and run each __fput(). */
static void delayed_fput(struct work_struct *unused)
{
	struct llist_node *node = llist_del_all(&delayed_fput_list);
	struct file *f, *t;

	llist_for_each_entry_safe(f, t, node, f_llist)
		__fput(f);
}

/* task_work callback used by fput() for the common (task context) case */
static void ____fput(struct callback_head *work)
{
	__fput(container_of(work, struct file, f_task_work));
}

/*
 * If kernel thread really needs to have the final fput() it has done
 * to complete, call this.
 * The only user right now is the boot - we
 * *do* need to make sure our writes to binaries on initramfs has
 * not left us with opened struct file waiting for __fput() - execve()
 * won't work without that. Please, don't add more callers without
 * very good reasons; in particular, never call that with locks
 * held and never call that from a thread that might need to do
 * some work on any kind of umount.
 */
void flush_delayed_fput(void)
{
	delayed_fput(NULL);
}
EXPORT_SYMBOL_GPL(flush_delayed_fput);

static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);

/*
 * Drop a reference to @file; on the last reference the actual teardown
 * is deferred: normally to task_work so __fput() runs when this task
 * returns to userspace, otherwise (interrupt context, kernel threads,
 * or a task already past exit_task_work()) to the delayed workqueue.
 * Files that were never opened are freed immediately - __fput() would
 * have nothing to release for them.
 */
void fput(struct file *file)
{
	if (atomic_long_dec_and_test(&file->f_count)) {
		struct task_struct *task = current;

		if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) {
			file_free(file);
			return;
		}
		if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
			init_task_work(&file->f_task_work, ____fput);
			if (!task_work_add(task, &file->f_task_work, TWA_RESUME))
				return;
			/*
			 * After this task has run exit_task_work(),
			 * task_work_add() will fail. Fall through to delayed
			 * fput to avoid leaking *file.
			 */
		}

		/* only the adder of the first entry schedules the work */
		if (llist_add(&file->f_llist, &delayed_fput_list))
			schedule_delayed_work(&delayed_fput_work, 1);
	}
}

/*
 * synchronous analog of fput(); for kernel threads that might be needed
 * in some umount() (and thus can't use flush_delayed_fput() without
 * risking deadlocks), need to wait for completion of __fput() and know
 * for this specific struct file it won't involve anything that would
 * need them. Use only if you really need it - at the very least,
 * don't blindly convert fput() by kernel thread to that.
 */
void __fput_sync(struct file *file)
{
	if (atomic_long_dec_and_test(&file->f_count))
		__fput(file);
}

EXPORT_SYMBOL(fput);
EXPORT_SYMBOL(__fput_sync);

/*
 * Create the filp slab cache and the nr_files counter. The dedicated
 * freelist pointer (f_freeptr) keeps the object contents - notably
 * f_count - intact while free, as required for SLAB_TYPESAFE_BY_RCU
 * lookups (see the ordering comment in init_file()).
 */
void __init files_init(void)
{
	struct kmem_cache_args args = {
		.use_freeptr_offset = true,
		.freeptr_offset = offsetof(struct file, f_freeptr),
	};

	filp_cachep = kmem_cache_create("filp", sizeof(struct file), &args,
				SLAB_HWCACHE_ALIGN | SLAB_PANIC |
				SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);
	percpu_counter_init(&nr_files, 0, GFP_KERNEL);
}

/*
 * One file with associated inode and dcache is very roughly 1K. Per default
 * do not use more than 10% of our memory for files.
 */
void __init files_maxfiles_init(void)
{
	unsigned long n;
	unsigned long nr_pages = totalram_pages();
	/* treat currently-used memory (times 1.5) as reserved */
	unsigned long memreserve = (nr_pages - nr_free_pages()) * 3/2;

	/* clamp so at least one page's worth survives the subtraction */
	memreserve = min(memreserve, nr_pages - 1);
	n = ((nr_pages - memreserve) * (PAGE_SIZE / 1024)) / 10;

	/* never go below the compile-time NR_FILE floor */
	files_stat.max_files = max_t(unsigned long, n, NR_FILE);
}