// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/file.c
 *
 *  Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
 *
 *  Manage the dynamic fd arrays in the process files_struct.
 */

#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/close_range.h>
#include <net/sock.h>

#include "internal.h"

/* Upper bound on fd numbers, checked by expand_files() and alloc_fdtable(). */
unsigned int sysctl_nr_open __read_mostly = 1024*1024;
unsigned int sysctl_nr_open_min = BITS_PER_LONG;
/* our min() is unusable in constant expressions ;-/ */
#define __const_min(x, y) ((x) < (y) ? (x) : (y))
/* "& -BITS_PER_LONG" rounds the limit down to a multiple of BITS_PER_LONG. */
unsigned int sysctl_nr_open_max =
	__const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG;

/*
 * Release a dynamically allocated fdtable: the file pointer array, the
 * bitmap allocation (alloc_fdtable() makes ->open_fds point at its start)
 * and finally the fdtable structure itself.
 */
static void __free_fdtable(struct fdtable *fdt)
{
	kvfree(fdt->fd);
	kvfree(fdt->open_fds);
	kfree(fdt);
}

/*
 * RCU callback (queued with call_rcu() by expand_fdtable()): recover the
 * fdtable from its embedded rcu_head and free it.
 */
static void free_fdtable_rcu(struct rcu_head *rcu)
{
	__free_fdtable(container_of(rcu, struct fdtable, rcu));
}

/*
 * Size, in longs and in bytes, of a second-level bitmap that holds one bit
 * per long of an nr-bit bitmap.
 */
#define BITBIT_NR(nr)	BITS_TO_LONGS(BITS_TO_LONGS(nr))
#define BITBIT_SIZE(nr)	(BITBIT_NR(nr) * sizeof(long))

#define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG) // words in ->open_fds
/*
 * Copy 'count' fd bits from the old table to the new table and clear the extra
 * space if any.  This does not copy the file pointers.  Called with the files
 * spinlock held for write.
54 */ 55 static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, 56 unsigned int copy_words) 57 { 58 unsigned int nwords = fdt_words(nfdt); 59 60 bitmap_copy_and_extend(nfdt->open_fds, ofdt->open_fds, 61 copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); 62 bitmap_copy_and_extend(nfdt->close_on_exec, ofdt->close_on_exec, 63 copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); 64 bitmap_copy_and_extend(nfdt->full_fds_bits, ofdt->full_fds_bits, 65 copy_words, nwords); 66 } 67 68 /* 69 * Copy all file descriptors from the old table to the new, expanded table and 70 * clear the extra space. Called with the files spinlock held for write. 71 */ 72 static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) 73 { 74 size_t cpy, set; 75 76 BUG_ON(nfdt->max_fds < ofdt->max_fds); 77 78 cpy = ofdt->max_fds * sizeof(struct file *); 79 set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *); 80 memcpy(nfdt->fd, ofdt->fd, cpy); 81 memset((char *)nfdt->fd + cpy, 0, set); 82 83 copy_fd_bitmaps(nfdt, ofdt, fdt_words(ofdt)); 84 } 85 86 /* 87 * Note how the fdtable bitmap allocations very much have to be a multiple of 88 * BITS_PER_LONG. This is not only because we walk those things in chunks of 89 * 'unsigned long' in some places, but simply because that is how the Linux 90 * kernel bitmaps are defined to work: they are not "bits in an array of bytes", 91 * they are very much "bits in an array of unsigned long". 92 * 93 * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied 94 * by that "1024/sizeof(ptr)" before, we already know there are sufficient 95 * clear low bits. Clang seems to realize that, gcc ends up being confused. 96 * 97 * On a 128-bit machine, the ALIGN() would actually matter. 
In the meantime, 98 * let's consider it documentation (and maybe a test-case for gcc to improve 99 * its code generation ;) 100 */ 101 static struct fdtable * alloc_fdtable(unsigned int nr) 102 { 103 struct fdtable *fdt; 104 void *data; 105 106 /* 107 * Figure out how many fds we actually want to support in this fdtable. 108 * Allocation steps are keyed to the size of the fdarray, since it 109 * grows far faster than any of the other dynamic data. We try to fit 110 * the fdarray into comfortable page-tuned chunks: starting at 1024B 111 * and growing in powers of two from there on. 112 */ 113 nr /= (1024 / sizeof(struct file *)); 114 nr = roundup_pow_of_two(nr + 1); 115 nr *= (1024 / sizeof(struct file *)); 116 nr = ALIGN(nr, BITS_PER_LONG); 117 /* 118 * Note that this can drive nr *below* what we had passed if sysctl_nr_open 119 * had been set lower between the check in expand_files() and here. Deal 120 * with that in caller, it's cheaper that way. 121 * 122 * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise 123 * bitmaps handling below becomes unpleasant, to put it mildly... 124 */ 125 if (unlikely(nr > sysctl_nr_open)) 126 nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1; 127 128 fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT); 129 if (!fdt) 130 goto out; 131 fdt->max_fds = nr; 132 data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT); 133 if (!data) 134 goto out_fdt; 135 fdt->fd = data; 136 137 data = kvmalloc(max_t(size_t, 138 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES), 139 GFP_KERNEL_ACCOUNT); 140 if (!data) 141 goto out_arr; 142 fdt->open_fds = data; 143 data += nr / BITS_PER_BYTE; 144 fdt->close_on_exec = data; 145 data += nr / BITS_PER_BYTE; 146 fdt->full_fds_bits = data; 147 148 return fdt; 149 150 out_arr: 151 kvfree(fdt->fd); 152 out_fdt: 153 kfree(fdt); 154 out: 155 return NULL; 156 } 157 158 /* 159 * Expand the file descriptor table. 
* This function will allocate a new fdtable and both fd array and fdset, of
 * the given size.
 * Return <0 error code on error; 1 on successful completion.
 * (-ENOMEM if the allocation failed, -EMFILE if the table that came back is
 * not large enough to hold @nr.)
 * The files->file_lock should be held on entry, and will be held on exit.
 * The lock is dropped and retaken in between.
 */
static int expand_fdtable(struct files_struct *files, unsigned int nr)
	__releases(files->file_lock)
	__acquires(files->file_lock)
{
	struct fdtable *new_fdt, *cur_fdt;

	/* Allocate (and possibly wait for RCU) without the spinlock held. */
	spin_unlock(&files->file_lock);
	new_fdt = alloc_fdtable(nr);

	/* make sure all fd_install() have seen resize_in_progress
	 * or have finished their rcu_read_lock_sched() section.
	 */
	if (atomic_read(&files->count) > 1)
		synchronize_rcu();

	spin_lock(&files->file_lock);
	if (!new_fdt)
		return -ENOMEM;
	/*
	 * extremely unlikely race - sysctl_nr_open decreased between the check in
	 * caller and alloc_fdtable().  Cheaper to catch it here...
	 */
	if (unlikely(new_fdt->max_fds <= nr)) {
		__free_fdtable(new_fdt);
		return -EMFILE;
	}
	cur_fdt = files_fdtable(files);
	BUG_ON(nr < cur_fdt->max_fds);
	copy_fdtable(new_fdt, cur_fdt);
	/* Publish the new table. */
	rcu_assign_pointer(files->fdt, new_fdt);
	/*
	 * The table embedded in files_struct is never freed on its own; any
	 * other old table is freed through the RCU callback.
	 */
	if (cur_fdt != &files->fdtab)
		call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
	/* coupled with smp_rmb() in fd_install() */
	smp_wmb();
	return 1;
}

/*
 * Expand files.
 * This function will expand the file structures, if the requested size exceeds
 * the current capacity and there is room for expansion.
 * Return <0 error code on error; 0 when nothing done; 1 when files were
 * expanded and execution may have blocked.
 * The files->file_lock should be held on entry, and will be held on exit.
*/
static int expand_files(struct files_struct *files, unsigned int nr)
	__releases(files->file_lock)
	__acquires(files->file_lock)
{
	struct fdtable *fdt;
	int expanded = 0;

repeat:
	fdt = files_fdtable(files);

	/* Do we need to expand? */
	if (nr < fdt->max_fds)
		return expanded;

	/* Can we expand? */
	if (nr >= sysctl_nr_open)
		return -EMFILE;

	/*
	 * Another resize is already running: drop the lock, wait for it to
	 * finish and re-evaluate from the top.  'expanded' is set because the
	 * lock was released, so the caller must treat this as "may have
	 * blocked" even if no further expansion turns out to be needed.
	 */
	if (unlikely(files->resize_in_progress)) {
		spin_unlock(&files->file_lock);
		expanded = 1;
		wait_event(files->resize_wait, !files->resize_in_progress);
		spin_lock(&files->file_lock);
		goto repeat;
	}

	/* All good, so we try */
	files->resize_in_progress = true;
	expanded = expand_fdtable(files, nr);
	files->resize_in_progress = false;

	/* Release anyone parked in the wait_event() above. */
	wake_up_all(&files->resize_wait);
	return expanded;
}

/* Set the close-on-exec bit for @fd. */
static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt)
{
	__set_bit(fd, fdt->close_on_exec);
}

/* Clear the close-on-exec bit for @fd; the bitmap is only written if the bit is set. */
static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt)
{
	if (test_bit(fd, fdt->close_on_exec))
		__clear_bit(fd, fdt->close_on_exec);
}

/*
 * Mark @fd as in use.  If that leaves the open_fds word containing it with
 * every bit set, the word is also flagged in full_fds_bits.
 */
static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt)
{
	__set_bit(fd, fdt->open_fds);
	fd /= BITS_PER_LONG;	/* from here on 'fd' is the word index */
	if (!~fdt->open_fds[fd])
		__set_bit(fd, fdt->full_fds_bits);
}

/* Mark @fd as free; its open_fds word can no longer be full. */
static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
{
	__clear_bit(fd, fdt->open_fds);
	__clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits);
}

/* Test whether @fd is marked in use in the open_fds bitmap. */
static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
{
	return test_bit(fd, fdt->open_fds);
}

/*
 * Return the number of descriptor slots up to and including the highest
 * non-empty open_fds word.  The result is always a multiple of BITS_PER_LONG
 * and is BITS_PER_LONG when no word above the first one has a bit set
 * (including when the table is entirely empty).
 */
static unsigned int count_open_files(struct fdtable *fdt)
{
	unsigned int size = fdt->max_fds;
	unsigned int i;

	/* Find the last open fd */
	for (i = size / BITS_PER_LONG; i > 0; ) {
		if (fdt->open_fds[--i])
			break;
	}
	i = (i + 1) * BITS_PER_LONG;
	return i;
}

/*
 * Note that a sane fdtable size always has to be a multiple of
 * BITS_PER_LONG, since we have bitmaps that are sized by this.
 *
 * 'max_fds' will normally already be properly aligned, but it
 * turns out that in the close_range() -> __close_range() ->
 * unshare_fd() -> dup_fd() -> sane_fdtable_size() we can end
 * up having a 'max_fds' value that isn't already aligned.
 *
 * Rather than make close_range() have to worry about this,
 * just make that BITS_PER_LONG alignment be part of a sane
 * fdtable size. Because that's really what it is.
 */
static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds)
{
	unsigned int count;

	count = count_open_files(fdt);
	/* Never go below the size of the table embedded in files_struct. */
	if (max_fds < NR_OPEN_DEFAULT)
		max_fds = NR_OPEN_DEFAULT;
	return ALIGN(min(count, max_fds), BITS_PER_LONG);
}

/*
 * Allocate a new files structure and copy contents from the
 * passed in files structure.
 * errorp will be valid only when the returned files_struct is NULL.
*/
struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int *errorp)
{
	struct files_struct *newf;
	struct file **old_fds, **new_fds;
	unsigned int open_files, i;
	struct fdtable *old_fdt, *new_fdt;

	*errorp = -ENOMEM;
	newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
	if (!newf)
		goto out;

	atomic_set(&newf->count, 1);

	/* Start out with the fdtable embedded in the new files_struct. */
	spin_lock_init(&newf->file_lock);
	newf->resize_in_progress = false;
	init_waitqueue_head(&newf->resize_wait);
	newf->next_fd = 0;
	new_fdt = &newf->fdtab;
	new_fdt->max_fds = NR_OPEN_DEFAULT;
	new_fdt->close_on_exec = newf->close_on_exec_init;
	new_fdt->open_fds = newf->open_fds_init;
	new_fdt->full_fds_bits = newf->full_fds_bits_init;
	new_fdt->fd = &newf->fd_array[0];

	spin_lock(&oldf->file_lock);
	old_fdt = files_fdtable(oldf);
	open_files = sane_fdtable_size(old_fdt, max_fds);

	/*
	 * Check whether we need to allocate a larger fd array and fd set.
	 */
	while (unlikely(open_files > new_fdt->max_fds)) {
		/* The allocation below is done without the old table's lock. */
		spin_unlock(&oldf->file_lock);

		/* Drop a table allocated by a previous pass of this loop. */
		if (new_fdt != &newf->fdtab)
			__free_fdtable(new_fdt);

		new_fdt = alloc_fdtable(open_files - 1);
		if (!new_fdt) {
			*errorp = -ENOMEM;
			goto out_release;
		}

		/* beyond sysctl_nr_open; nothing to do */
		if (unlikely(new_fdt->max_fds < open_files)) {
			__free_fdtable(new_fdt);
			*errorp = -EMFILE;
			goto out_release;
		}

		/*
		 * Reacquire the oldf lock and a pointer to its fd table
		 * who knows it may have a new bigger fd table. We need
		 * the latest pointer.
		 */
		spin_lock(&oldf->file_lock);
		old_fdt = files_fdtable(oldf);
		open_files = sane_fdtable_size(old_fdt, max_fds);
	}

	copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG);

	old_fds = old_fdt->fd;
	new_fds = new_fdt->fd;

	/* Walk slots 0 .. open_files - 1; the current slot is open_files - i. */
	for (i = open_files; i != 0; i--) {
		struct file *f = *old_fds++;
		if (f) {
			get_file(f);
		} else {
			/*
			 * The fd may be claimed in the fd bitmap but not yet
			 * instantiated in the files array if a sibling thread
			 * is partway through open().  So make sure that this
			 * fd is available to the new process.
			 */
			__clear_open_fd(open_files - i, new_fdt);
		}
		rcu_assign_pointer(*new_fds++, f);
	}
	spin_unlock(&oldf->file_lock);

	/* clear the remainder */
	memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *));

	rcu_assign_pointer(newf->fdt, new_fdt);

	return newf;

out_release:
	kmem_cache_free(files_cachep, newf);
out:
	return NULL;
}

/*
 * Close every file still installed in @files and return the fdtable that was
 * walked, so the caller can free it.
 */
static struct fdtable *close_files(struct files_struct * files)
{
	/*
	 * It is safe to dereference the fd table without RCU or
	 * ->file_lock because this is the last reference to the
	 * files structure.
	 */
	struct fdtable *fdt = rcu_dereference_raw(files->fdt);
	unsigned int i, j = 0;

	/* j indexes open_fds words; i is the descriptor for the current bit. */
	for (;;) {
		unsigned long set;
		i = j * BITS_PER_LONG;
		if (i >= fdt->max_fds)
			break;
		set = fdt->open_fds[j++];
		while (set) {
			if (set & 1) {
				struct file * file = xchg(&fdt->fd[i], NULL);
				if (file) {
					filp_close(file, files);
					cond_resched();
				}
			}
			i++;
			set >>= 1;
		}
	}

	return fdt;
}

/*
 * Drop one reference to @files.  When the last reference goes away, all
 * remaining files are closed and the structure is freed.
 */
void put_files_struct(struct files_struct *files)
{
	if (atomic_dec_and_test(&files->count)) {
		struct fdtable *fdt = close_files(files);

		/* free the arrays if they are not embedded */
		if (fdt != &files->fdtab)
			__free_fdtable(fdt);
		kmem_cache_free(files_cachep, files);
	}
}

/*
 * Detach @tsk from its files_struct (under task_lock) and drop the task's
 * reference to it.  Does nothing if the task has no files_struct.
 */
void exit_files(struct task_struct *tsk)
{
	struct files_struct * files = tsk->files;

	if (files) {
		task_lock(tsk);
		tsk->files = NULL;
		task_unlock(tsk);
		put_files_struct(files);
	}
}

/*
 * Statically initialised files_struct: one reference, and an fdtable that
 * points at the arrays embedded in the structure itself.
 */
struct files_struct init_files = {
	.count		= ATOMIC_INIT(1),
	.fdt		= &init_files.fdtab,
	.fdtab		= {
		.max_fds	= NR_OPEN_DEFAULT,
		.fd		= &init_files.fd_array[0],
		.close_on_exec	= init_files.close_on_exec_init,
		.open_fds	= init_files.open_fds_init,
		.full_fds_bits	= init_files.full_fds_bits_init,
	},
	.file_lock	= __SPIN_LOCK_UNLOCKED(init_files.file_lock),
	.resize_wait	= __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait),
};

/*
 * Find the lowest free descriptor at or above @start.  full_fds_bits is
 * consulted first so that completely full open_fds words are skipped.
 * Returns fdt->max_fds if every word from @start's word onwards is full.
 */
static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
{
	unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */
	unsigned int maxbit = maxfd / BITS_PER_LONG;
	unsigned int bitbit = start / BITS_PER_LONG;

	/* First descriptor of the first not-full word at or after @start's word. */
	bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
	if (bitbit >= maxfd)
		return maxfd;
	if (bitbit > start)
		start = bitbit;
	return find_next_zero_bit(fdt->open_fds, maxfd, start);
}

/*
* allocate a file descriptor, mark it busy.
 *
 * The search starts at the larger of @start and files->next_fd and fails
 * with -EMFILE once the candidate reaches @end.  @flags is only inspected
 * for O_CLOEXEC.  Returns the new descriptor, or a negative error code.
 */
static int alloc_fd(unsigned start, unsigned end, unsigned flags)
{
	struct files_struct *files = current->files;
	unsigned int fd;
	int error;
	struct fdtable *fdt;

	spin_lock(&files->file_lock);
repeat:
	fdt = files_fdtable(files);
	fd = start;
	if (fd < files->next_fd)
		fd = files->next_fd;

	if (fd < fdt->max_fds)
		fd = find_next_fd(fdt, fd);

	/*
	 * N.B. For clone tasks sharing a files structure, this test
	 * will limit the total number of files that can be opened.
	 */
	error = -EMFILE;
	if (fd >= end)
		goto out;

	error = expand_files(files, fd);
	if (error < 0)
		goto out;

	/*
	 * If we needed to expand the fs array we
	 * might have blocked - try again.
	 */
	if (error)
		goto repeat;

	/* Only move the search hint when the search started at or below it. */
	if (start <= files->next_fd)
		files->next_fd = fd + 1;

	__set_open_fd(fd, fdt);
	if (flags & O_CLOEXEC)
		__set_close_on_exec(fd, fdt);
	else
		__clear_close_on_exec(fd, fdt);
	error = fd;
#if 1
	/* Sanity check */
	if (rcu_access_pointer(fdt->fd[fd]) != NULL) {
		printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
		rcu_assign_pointer(fdt->fd[fd], NULL);
	}
#endif

out:
	spin_unlock(&files->file_lock);
	return error;
}

/* Allocate the lowest free descriptor below @nofile for the current task. */
int __get_unused_fd_flags(unsigned flags, unsigned long nofile)
{
	return alloc_fd(0, nofile, flags);
}

/* As __get_unused_fd_flags(), bounded by the caller's RLIMIT_NOFILE. */
int get_unused_fd_flags(unsigned flags)
{
	return __get_unused_fd_flags(flags, rlimit(RLIMIT_NOFILE));
}
EXPORT_SYMBOL(get_unused_fd_flags);

/*
 * Mark @fd free again and pull the next_fd search hint down to it if it is
 * lower.  Every caller in this file holds files->file_lock.
 */
static void __put_unused_fd(struct files_struct *files, unsigned int fd)
{
	struct fdtable *fdt = files_fdtable(files);
	__clear_open_fd(fd, fdt);
	if (fd < files->next_fd)
		files->next_fd = fd;
}

/* Locked wrapper around __put_unused_fd() for the current task's table. */
void put_unused_fd(unsigned int fd)
{
	struct files_struct *files = current->files;
	spin_lock(&files->file_lock);
	__put_unused_fd(files, fd);
	spin_unlock(&files->file_lock);
}

EXPORT_SYMBOL(put_unused_fd);

/*
 * Install a file pointer in the fd array.
 *
 * The VFS is full of places where we drop the files lock between
 * setting the open_fds bitmap and installing the file in the file
 * array.  At any such point, we are vulnerable to a dup2() race
 * installing a file in the array before us.  We need to detect this and
 * fput() the struct file we are about to overwrite in this case.
 *
 * It should never happen - if we allow dup2() do it, _really_ bad things
 * will follow.
 *
 * This consumes the "file" refcount, so callers should treat it
 * as if they had called fput(file).
 */

void fd_install(unsigned int fd, struct file *file)
{
	struct files_struct *files = current->files;
	struct fdtable *fdt;

	if (WARN_ON_ONCE(unlikely(file->f_mode & FMODE_BACKING)))
		return;

	rcu_read_lock_sched();

	/*
	 * A resize is in flight: leave the RCU-sched section and install the
	 * pointer under the spinlock instead.
	 */
	if (unlikely(files->resize_in_progress)) {
		rcu_read_unlock_sched();
		spin_lock(&files->file_lock);
		fdt = files_fdtable(files);
		BUG_ON(fdt->fd[fd] != NULL);
		rcu_assign_pointer(fdt->fd[fd], file);
		spin_unlock(&files->file_lock);
		return;
	}
	/* coupled with smp_wmb() in expand_fdtable() */
	smp_rmb();
	fdt = rcu_dereference_sched(files->fdt);
	BUG_ON(fdt->fd[fd] != NULL);
	rcu_assign_pointer(fdt->fd[fd], file);
	rcu_read_unlock_sched();
}

EXPORT_SYMBOL(fd_install);

/**
 * file_close_fd_locked - return file associated with fd
 * @files: file struct to retrieve file from
 * @fd: file descriptor to retrieve file for
 *
 * Doesn't take a separate reference count.
 *
 * Context: files_lock must be held.
640 * 641 * Returns: The file associated with @fd (NULL if @fd is not open) 642 */ 643 struct file *file_close_fd_locked(struct files_struct *files, unsigned fd) 644 { 645 struct fdtable *fdt = files_fdtable(files); 646 struct file *file; 647 648 lockdep_assert_held(&files->file_lock); 649 650 if (fd >= fdt->max_fds) 651 return NULL; 652 653 fd = array_index_nospec(fd, fdt->max_fds); 654 file = fdt->fd[fd]; 655 if (file) { 656 rcu_assign_pointer(fdt->fd[fd], NULL); 657 __put_unused_fd(files, fd); 658 } 659 return file; 660 } 661 662 int close_fd(unsigned fd) 663 { 664 struct files_struct *files = current->files; 665 struct file *file; 666 667 spin_lock(&files->file_lock); 668 file = file_close_fd_locked(files, fd); 669 spin_unlock(&files->file_lock); 670 if (!file) 671 return -EBADF; 672 673 return filp_close(file, files); 674 } 675 EXPORT_SYMBOL(close_fd); 676 677 /** 678 * last_fd - return last valid index into fd table 679 * @fdt: File descriptor table. 680 * 681 * Context: Either rcu read lock or files_lock must be held. 682 * 683 * Returns: Last valid index into fdtable. 
684 */ 685 static inline unsigned last_fd(struct fdtable *fdt) 686 { 687 return fdt->max_fds - 1; 688 } 689 690 static inline void __range_cloexec(struct files_struct *cur_fds, 691 unsigned int fd, unsigned int max_fd) 692 { 693 struct fdtable *fdt; 694 695 /* make sure we're using the correct maximum value */ 696 spin_lock(&cur_fds->file_lock); 697 fdt = files_fdtable(cur_fds); 698 max_fd = min(last_fd(fdt), max_fd); 699 if (fd <= max_fd) 700 bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1); 701 spin_unlock(&cur_fds->file_lock); 702 } 703 704 static inline void __range_close(struct files_struct *files, unsigned int fd, 705 unsigned int max_fd) 706 { 707 struct file *file; 708 unsigned n; 709 710 spin_lock(&files->file_lock); 711 n = last_fd(files_fdtable(files)); 712 max_fd = min(max_fd, n); 713 714 for (; fd <= max_fd; fd++) { 715 file = file_close_fd_locked(files, fd); 716 if (file) { 717 spin_unlock(&files->file_lock); 718 filp_close(file, files); 719 cond_resched(); 720 spin_lock(&files->file_lock); 721 } else if (need_resched()) { 722 spin_unlock(&files->file_lock); 723 cond_resched(); 724 spin_lock(&files->file_lock); 725 } 726 } 727 spin_unlock(&files->file_lock); 728 } 729 730 /** 731 * __close_range() - Close all file descriptors in a given range. 732 * 733 * @fd: starting file descriptor to close 734 * @max_fd: last file descriptor to close 735 * @flags: CLOSE_RANGE flags. 736 * 737 * This closes a range of file descriptors. All file descriptors 738 * from @fd up to and including @max_fd are closed. 
739 */ 740 int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) 741 { 742 struct task_struct *me = current; 743 struct files_struct *cur_fds = me->files, *fds = NULL; 744 745 if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC)) 746 return -EINVAL; 747 748 if (fd > max_fd) 749 return -EINVAL; 750 751 if (flags & CLOSE_RANGE_UNSHARE) { 752 int ret; 753 unsigned int max_unshare_fds = NR_OPEN_MAX; 754 755 /* 756 * If the caller requested all fds to be made cloexec we always 757 * copy all of the file descriptors since they still want to 758 * use them. 759 */ 760 if (!(flags & CLOSE_RANGE_CLOEXEC)) { 761 /* 762 * If the requested range is greater than the current 763 * maximum, we're closing everything so only copy all 764 * file descriptors beneath the lowest file descriptor. 765 */ 766 rcu_read_lock(); 767 if (max_fd >= last_fd(files_fdtable(cur_fds))) 768 max_unshare_fds = fd; 769 rcu_read_unlock(); 770 } 771 772 ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds); 773 if (ret) 774 return ret; 775 776 /* 777 * We used to share our file descriptor table, and have now 778 * created a private one, make sure we're using it below. 779 */ 780 if (fds) 781 swap(cur_fds, fds); 782 } 783 784 if (flags & CLOSE_RANGE_CLOEXEC) 785 __range_cloexec(cur_fds, fd, max_fd); 786 else 787 __range_close(cur_fds, fd, max_fd); 788 789 if (fds) { 790 /* 791 * We're done closing the files we were supposed to. Time to install 792 * the new file descriptor table and drop the old one. 793 */ 794 task_lock(me); 795 me->files = cur_fds; 796 task_unlock(me); 797 put_files_struct(fds); 798 } 799 800 return 0; 801 } 802 803 /** 804 * file_close_fd - return file associated with fd 805 * @fd: file descriptor to retrieve file for 806 * 807 * Doesn't take a separate reference count. 
*
 * Returns: The file associated with @fd (NULL if @fd is not open)
 */
struct file *file_close_fd(unsigned int fd)
{
	struct files_struct *files = current->files;
	struct file *file;

	/* Locked wrapper around file_close_fd_locked() for the current task. */
	spin_lock(&files->file_lock);
	file = file_close_fd_locked(files, fd);
	spin_unlock(&files->file_lock);

	return file;
}

/*
 * Close every descriptor of @files that has its close-on-exec bit set.  The
 * close_on_exec bitmap is consumed one word at a time, and the spinlock is
 * dropped around each filp_close().
 */
void do_close_on_exec(struct files_struct *files)
{
	unsigned i;
	struct fdtable *fdt;

	/* exec unshares first */
	spin_lock(&files->file_lock);
	for (i = 0; ; i++) {
		unsigned long set;
		unsigned fd = i * BITS_PER_LONG;
		/* Re-read the table on every pass: the lock is dropped below. */
		fdt = files_fdtable(files);
		if (fd >= fdt->max_fds)
			break;
		set = fdt->close_on_exec[i];
		if (!set)
			continue;
		/* Clear the whole word up front; 'set' keeps the bits to process. */
		fdt->close_on_exec[i] = 0;
		for ( ; set ; fd++, set >>= 1) {
			struct file *file;
			if (!(set & 1))
				continue;
			file = fdt->fd[fd];
			if (!file)
				continue;
			rcu_assign_pointer(fdt->fd[fd], NULL);
			__put_unused_fd(files, fd);
			spin_unlock(&files->file_lock);
			filp_close(file, files);
			cond_resched();
			spin_lock(&files->file_lock);
		}

	}
	spin_unlock(&files->file_lock);
}

/*
 * Single attempt at taking a reference on the file that *@f points to.
 *
 * Returns NULL if *@f is NULL, the file with an extra reference on success,
 * or ERR_PTR(-EAGAIN) if the refcount was already zero or *@f no longer
 * pointed at the same file once the reference had been taken.
 */
static struct file *__get_file_rcu(struct file __rcu **f)
{
	struct file __rcu *file;
	struct file __rcu *file_reloaded;
	struct file __rcu *file_reloaded_cmp;

	file = rcu_dereference_raw(*f);
	if (!file)
		return NULL;

	if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
		return ERR_PTR(-EAGAIN);

	file_reloaded = rcu_dereference_raw(*f);

	/*
	 * Ensure that all accesses have a dependency on the load from
	 * rcu_dereference_raw() above so we get correct ordering
	 * between reuse/allocation and the pointer check below.
	 */
	file_reloaded_cmp = file_reloaded;
	OPTIMIZER_HIDE_VAR(file_reloaded_cmp);

	/*
	 * atomic_long_inc_not_zero() above provided a full memory
	 * barrier when we acquired a reference.
	 *
	 * This is paired with the write barrier from assigning to the
	 * __rcu protected file pointer so that if that pointer still
	 * matches the current file, we know we have successfully
	 * acquired a reference to the right file.
	 *
	 * If the pointers don't match the file has been reallocated by
	 * SLAB_TYPESAFE_BY_RCU.
	 */
	if (file == file_reloaded_cmp)
		return file_reloaded;

	/* The reference was taken on the wrong object; give it back. */
	fput(file);
	return ERR_PTR(-EAGAIN);
}

/**
 * get_file_rcu - try to get a reference to a file under rcu
 * @f: the file to get a reference on
 *
 * This function tries to get a reference on @f carefully verifying that
 * @f hasn't been reused.
 *
 * This function should rarely have to be used and only by users who
 * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
 *
 * Return: Returns @f with the reference count increased or NULL.
 */
struct file *get_file_rcu(struct file __rcu **f)
{
	/* Retry for as long as __get_file_rcu() reports an error pointer. */
	for (;;) {
		struct file __rcu *file;

		file = __get_file_rcu(f);
		if (!IS_ERR(file))
			return file;
	}
}
EXPORT_SYMBOL_GPL(get_file_rcu);

/**
 * get_file_active - try to get a reference to a file
 * @f: the file to get a reference on
 *
 * In contrast to get_file_rcu() the pointer itself isn't part of the
 * reference counting.
 *
 * This function should rarely have to be used and only by users who
 * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
 *
 * Return: Returns @f with the reference count increased or NULL.
*/
struct file *get_file_active(struct file **f)
{
	struct file __rcu *file;

	/* One attempt only: a failed attempt is reported as NULL, not retried. */
	rcu_read_lock();
	file = __get_file_rcu(f);
	rcu_read_unlock();
	if (IS_ERR(file))
		file = NULL;
	return file;
}
EXPORT_SYMBOL_GPL(get_file_active);

/*
 * Lockless lookup of @fd in @files, returning the file with a reference
 * taken.  Returns NULL if @fd is out of range, if the slot is empty, or if
 * the file's f_mode has any bit of @mask set.  The loop restarts whenever
 * the reference cannot be taken or the slot or table changed underneath it.
 */
static inline struct file *__fget_files_rcu(struct files_struct *files,
       unsigned int fd, fmode_t mask)
{
	for (;;) {
		struct file *file;
		struct fdtable *fdt = rcu_dereference_raw(files->fdt);
		struct file __rcu **fdentry;
		unsigned long nospec_mask;

		/* Mask is a 0 for invalid fd's, ~0 for valid ones */
		nospec_mask = array_index_mask_nospec(fd, fdt->max_fds);

		/*
		 * fdentry points to the 'fd' offset, or fdt->fd[0].
		 * Loading from fdt->fd[0] is always safe, because the
		 * array always exists.
		 */
		fdentry = fdt->fd + (fd & nospec_mask);

		/* Do the load, then mask any invalid result */
		file = rcu_dereference_raw(*fdentry);
		file = (void *)(nospec_mask & (unsigned long)file);
		if (unlikely(!file))
			return NULL;

		/*
		 * Ok, we have a file pointer that was valid at
		 * some point, but it might have become stale since.
		 *
		 * We need to confirm it by incrementing the refcount
		 * and then check the lookup again.
		 *
		 * atomic_long_inc_not_zero() gives us a full memory
		 * barrier. We only really need an 'acquire' one to
		 * protect the loads below, but we don't have that.
		 */
		if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
			continue;

		/*
		 * Such a race can take two forms:
		 *
		 *  (a) the file ref already went down to zero and the
		 *      file hasn't been reused yet or the file count
		 *      isn't zero but the file has already been reused.
		 *
		 *  (b) the file table entry has changed under us.
		 *       Note that we don't need to re-check the 'fdt->fd'
		 *       pointer having changed, because it always goes
		 *       hand-in-hand with 'fdt'.
		 *
		 * If so, we need to put our ref and try again.
		 */
		if (unlikely(file != rcu_dereference_raw(*fdentry)) ||
		    unlikely(rcu_dereference_raw(files->fdt) != fdt)) {
			fput(file);
			continue;
		}

		/*
		 * This isn't the file we're looking for or we're not
		 * allowed to get a reference to it.
		 */
		if (unlikely(file->f_mode & mask)) {
			fput(file);
			return NULL;
		}

		/*
		 * Ok, we have a ref to the file, and checked that it
		 * still exists.
		 */
		return file;
	}
}

/* __fget_files_rcu() wrapped in its own RCU read-side critical section. */
static struct file *__fget_files(struct files_struct *files, unsigned int fd,
				 fmode_t mask)
{
	struct file *file;

	rcu_read_lock();
	file = __fget_files_rcu(files, fd, mask);
	rcu_read_unlock();

	return file;
}

/* Look @fd up in the current task's table. */
static inline struct file *__fget(unsigned int fd, fmode_t mask)
{
	return __fget_files(current->files, fd, mask);
}

/* Get a reference to the file at @fd; files with FMODE_PATH set yield NULL. */
struct file *fget(unsigned int fd)
{
	return __fget(fd, FMODE_PATH);
}
EXPORT_SYMBOL(fget);

/* As fget(), but with no f_mode bits excluded. */
struct file *fget_raw(unsigned int fd)
{
	return __fget(fd, 0);
}
EXPORT_SYMBOL(fget_raw);

/*
 * Get a reference to the file at @fd in @task's table, under task_lock().
 * Returns NULL if the task has no files_struct or the lookup fails.
 */
struct file *fget_task(struct task_struct *task, unsigned int fd)
{
	struct file *file = NULL;

	task_lock(task);
	if (task->files)
		file = __fget_files(task->files, fd, 0);
	task_unlock(task);

	return file;
}

/*
 * Look @fd up in the current task's table without entering an RCU read-side
 * section here.
 */
struct file *lookup_fdget_rcu(unsigned int fd)
{
	return __fget_files_rcu(current->files, fd, 0);

}
EXPORT_SYMBOL_GPL(lookup_fdget_rcu);

/* As lookup_fdget_rcu(), for @task's table, under task_lock(). */
struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd)
{
	/* Must be called with rcu_read_lock held */
	struct files_struct *files;
	struct file *file = NULL;

	task_lock(task);
	files = task->files;
	if (files)
		file = __fget_files_rcu(files, fd, 0);
	task_unlock(task);

	return file;
}

/*
 * Starting at *@ret_fd, find the first descriptor of @task for which a
 * reference can be obtained.  On return *@ret_fd holds the descriptor of the
 * returned file, or the value at which the scan stopped when NULL is
 * returned.
 */
struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *ret_fd)
{
	/* Must be called with rcu_read_lock held */
	struct files_struct *files;
	unsigned int fd = *ret_fd;
	struct file *file = NULL;

	task_lock(task);
	files = task->files;
	if (files) {
		for (; fd < files_fdtable(files)->max_fds; fd++) {
			file = __fget_files_rcu(files, fd, 0);
			if (file)
				break;
		}
	}
	task_unlock(task);
	*ret_fd = fd;
	return file;
}
EXPORT_SYMBOL(task_lookup_next_fdget_rcu);

/*
 * Lightweight file lookup - no refcnt increment if fd table isn't shared.
 *
 * You can use this instead of fget if you satisfy all of the following
 * conditions:
 * 1) You must call fput_light before exiting the syscall and returning control
 *    to userspace (i.e. you cannot remember the returned struct file * after
 *    returning to userspace).
 * 2) You must not call filp_close on the returned struct file * in between
 *    calls to fget_light and fput_light.
 * 3) You must not clone the current task in between the calls to fget_light
 *    and fput_light.
 *
 * The fput_needed flag returned by fget_light should be passed to the
 * corresponding fput_light.
 */
static inline struct fd __fget_light(unsigned int fd, fmode_t mask)
{
	struct files_struct *files = current->files;
	struct file *file;

	/*
	 * If another thread is concurrently calling close_fd() followed
	 * by put_files_struct(), we must not observe the old table
	 * entry combined with the new refcount - otherwise we could
	 * return a file that is concurrently being freed.
	 *
	 * atomic_read_acquire() pairs with atomic_dec_and_test() in
	 * put_files_struct().
	 */
	if (likely(atomic_read_acquire(&files->count) == 1)) {
		/* Table not shared: borrow the pointer without a reference. */
		file = files_lookup_fd_raw(files, fd);
		if (!file || unlikely(file->f_mode & mask))
			return EMPTY_FD;
		return BORROWED_FD(file);
	} else {
		/* Table shared: take a real reference. */
		file = __fget_files(files, fd, mask);
		if (!file)
			return EMPTY_FD;
		return CLONED_FD(file);
	}
}
/* Lightweight lookup of @fd; files with FMODE_PATH set yield EMPTY_FD. */
struct fd fdget(unsigned int fd)
{
	return __fget_light(fd, FMODE_PATH);
}
EXPORT_SYMBOL(fdget);

/* As fdget(), but with no f_mode bits excluded. */
struct fd fdget_raw(unsigned int fd)
{
	return __fget_light(fd, 0);
}

/*
 * Try to avoid f_pos locking. We only need it if the
 * file is marked for FMODE_ATOMIC_POS, and it can be
 * accessed multiple ways.
 *
 * Always do it for directories, because pidfd_getfd()
 * can make a file accessible even if it otherwise would
 * not be, and for directories this is a correctness
 * issue, not a "POSIX requirement".
 */
static inline bool file_needs_f_pos_lock(struct file *file)
{
	return (file->f_mode & FMODE_ATOMIC_POS) &&
		(file_count(file) > 1 || file->f_op->iterate_shared);
}

/*
 * fdget() plus, when file_needs_f_pos_lock() says so, taking f_pos_lock.
 * FDPUT_POS_UNLOCK is recorded in the returned struct fd when the lock was
 * taken.
 */
struct fd fdget_pos(unsigned int fd)
{
	struct fd f = fdget(fd);
	struct file *file = fd_file(f);

	if (file && file_needs_f_pos_lock(file)) {
		f.word |= FDPUT_POS_UNLOCK;
		mutex_lock(&file->f_pos_lock);
	}
	return f;
}

/* Release the f_pos_lock of @f. */
void __f_unlock_pos(struct file *f)
{
	mutex_unlock(&f->f_pos_lock);
}

/*
 * We only lock f_pos if we have threads or if the file might be
 * shared with another process. In both cases we'll have an elevated
 * file count (done either by fdget() or by fork()).
1201 */ 1202 1203 void set_close_on_exec(unsigned int fd, int flag) 1204 { 1205 struct files_struct *files = current->files; 1206 struct fdtable *fdt; 1207 spin_lock(&files->file_lock); 1208 fdt = files_fdtable(files); 1209 if (flag) 1210 __set_close_on_exec(fd, fdt); 1211 else 1212 __clear_close_on_exec(fd, fdt); 1213 spin_unlock(&files->file_lock); 1214 } 1215 1216 bool get_close_on_exec(unsigned int fd) 1217 { 1218 bool res; 1219 rcu_read_lock(); 1220 res = close_on_exec(fd, current->files); 1221 rcu_read_unlock(); 1222 return res; 1223 } 1224 1225 static int do_dup2(struct files_struct *files, 1226 struct file *file, unsigned fd, unsigned flags) 1227 __releases(&files->file_lock) 1228 { 1229 struct file *tofree; 1230 struct fdtable *fdt; 1231 1232 /* 1233 * We need to detect attempts to do dup2() over allocated but still 1234 * not finished descriptor. NB: OpenBSD avoids that at the price of 1235 * extra work in their equivalent of fget() - they insert struct 1236 * file immediately after grabbing descriptor, mark it larval if 1237 * more work (e.g. actual opening) is needed and make sure that 1238 * fget() treats larval files as absent. Potentially interesting, 1239 * but while extra work in fget() is trivial, locking implications 1240 * and amount of surgery on open()-related paths in VFS are not. 1241 * FreeBSD fails with -EBADF in the same situation, NetBSD "solution" 1242 * deadlocks in rather amusing ways, AFAICS. All of that is out of 1243 * scope of POSIX or SUS, since neither considers shared descriptor 1244 * tables and this condition does not arise without those. 
1245 */ 1246 fdt = files_fdtable(files); 1247 fd = array_index_nospec(fd, fdt->max_fds); 1248 tofree = fdt->fd[fd]; 1249 if (!tofree && fd_is_open(fd, fdt)) 1250 goto Ebusy; 1251 get_file(file); 1252 rcu_assign_pointer(fdt->fd[fd], file); 1253 __set_open_fd(fd, fdt); 1254 if (flags & O_CLOEXEC) 1255 __set_close_on_exec(fd, fdt); 1256 else 1257 __clear_close_on_exec(fd, fdt); 1258 spin_unlock(&files->file_lock); 1259 1260 if (tofree) 1261 filp_close(tofree, files); 1262 1263 return fd; 1264 1265 Ebusy: 1266 spin_unlock(&files->file_lock); 1267 return -EBUSY; 1268 } 1269 1270 int replace_fd(unsigned fd, struct file *file, unsigned flags) 1271 { 1272 int err; 1273 struct files_struct *files = current->files; 1274 1275 if (!file) 1276 return close_fd(fd); 1277 1278 if (fd >= rlimit(RLIMIT_NOFILE)) 1279 return -EBADF; 1280 1281 spin_lock(&files->file_lock); 1282 err = expand_files(files, fd); 1283 if (unlikely(err < 0)) 1284 goto out_unlock; 1285 return do_dup2(files, file, fd, flags); 1286 1287 out_unlock: 1288 spin_unlock(&files->file_lock); 1289 return err; 1290 } 1291 1292 /** 1293 * receive_fd() - Install received file into file descriptor table 1294 * @file: struct file that was received from another process 1295 * @ufd: __user pointer to write new fd number to 1296 * @o_flags: the O_* flags to apply to the new fd entry 1297 * 1298 * Installs a received file into the file descriptor table, with appropriate 1299 * checks and count updates. Optionally writes the fd number to userspace, if 1300 * @ufd is non-NULL. 1301 * 1302 * This helper handles its own reference counting of the incoming 1303 * struct file. 1304 * 1305 * Returns newly install fd or -ve on error. 
1306 */ 1307 int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) 1308 { 1309 int new_fd; 1310 int error; 1311 1312 error = security_file_receive(file); 1313 if (error) 1314 return error; 1315 1316 new_fd = get_unused_fd_flags(o_flags); 1317 if (new_fd < 0) 1318 return new_fd; 1319 1320 if (ufd) { 1321 error = put_user(new_fd, ufd); 1322 if (error) { 1323 put_unused_fd(new_fd); 1324 return error; 1325 } 1326 } 1327 1328 fd_install(new_fd, get_file(file)); 1329 __receive_sock(file); 1330 return new_fd; 1331 } 1332 EXPORT_SYMBOL_GPL(receive_fd); 1333 1334 int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags) 1335 { 1336 int error; 1337 1338 error = security_file_receive(file); 1339 if (error) 1340 return error; 1341 error = replace_fd(new_fd, file, o_flags); 1342 if (error) 1343 return error; 1344 __receive_sock(file); 1345 return new_fd; 1346 } 1347 1348 static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags) 1349 { 1350 int err = -EBADF; 1351 struct file *file; 1352 struct files_struct *files = current->files; 1353 1354 if ((flags & ~O_CLOEXEC) != 0) 1355 return -EINVAL; 1356 1357 if (unlikely(oldfd == newfd)) 1358 return -EINVAL; 1359 1360 if (newfd >= rlimit(RLIMIT_NOFILE)) 1361 return -EBADF; 1362 1363 spin_lock(&files->file_lock); 1364 err = expand_files(files, newfd); 1365 file = files_lookup_fd_locked(files, oldfd); 1366 if (unlikely(!file)) 1367 goto Ebadf; 1368 if (unlikely(err < 0)) { 1369 if (err == -EMFILE) 1370 goto Ebadf; 1371 goto out_unlock; 1372 } 1373 return do_dup2(files, file, newfd, flags); 1374 1375 Ebadf: 1376 err = -EBADF; 1377 out_unlock: 1378 spin_unlock(&files->file_lock); 1379 return err; 1380 } 1381 1382 SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) 1383 { 1384 return ksys_dup3(oldfd, newfd, flags); 1385 } 1386 1387 SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) 1388 { 1389 if (unlikely(newfd == oldfd)) { /* corner case */ 1390 struct 
files_struct *files = current->files; 1391 struct file *f; 1392 int retval = oldfd; 1393 1394 rcu_read_lock(); 1395 f = __fget_files_rcu(files, oldfd, 0); 1396 if (!f) 1397 retval = -EBADF; 1398 rcu_read_unlock(); 1399 if (f) 1400 fput(f); 1401 return retval; 1402 } 1403 return ksys_dup3(oldfd, newfd, 0); 1404 } 1405 1406 SYSCALL_DEFINE1(dup, unsigned int, fildes) 1407 { 1408 int ret = -EBADF; 1409 struct file *file = fget_raw(fildes); 1410 1411 if (file) { 1412 ret = get_unused_fd_flags(0); 1413 if (ret >= 0) 1414 fd_install(ret, file); 1415 else 1416 fput(file); 1417 } 1418 return ret; 1419 } 1420 1421 int f_dupfd(unsigned int from, struct file *file, unsigned flags) 1422 { 1423 unsigned long nofile = rlimit(RLIMIT_NOFILE); 1424 int err; 1425 if (from >= nofile) 1426 return -EINVAL; 1427 err = alloc_fd(from, nofile, flags); 1428 if (err >= 0) { 1429 get_file(file); 1430 fd_install(err, file); 1431 } 1432 return err; 1433 } 1434 1435 int iterate_fd(struct files_struct *files, unsigned n, 1436 int (*f)(const void *, struct file *, unsigned), 1437 const void *p) 1438 { 1439 struct fdtable *fdt; 1440 int res = 0; 1441 if (!files) 1442 return 0; 1443 spin_lock(&files->file_lock); 1444 for (fdt = files_fdtable(files); n < fdt->max_fds; n++) { 1445 struct file *file; 1446 file = rcu_dereference_check_fdtable(files, fdt->fd[n]); 1447 if (!file) 1448 continue; 1449 res = f(p, file, n); 1450 if (res) 1451 break; 1452 } 1453 spin_unlock(&files->file_lock); 1454 return res; 1455 } 1456 EXPORT_SYMBOL(iterate_fd); 1457