1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * linux/fs/file.c 4 * 5 * Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes 6 * 7 * Manage the dynamic fd arrays in the process files_struct. 8 */ 9 10 #include <linux/syscalls.h> 11 #include <linux/export.h> 12 #include <linux/fs.h> 13 #include <linux/kernel.h> 14 #include <linux/mm.h> 15 #include <linux/sched/signal.h> 16 #include <linux/slab.h> 17 #include <linux/file.h> 18 #include <linux/fdtable.h> 19 #include <linux/bitops.h> 20 #include <linux/spinlock.h> 21 #include <linux/rcupdate.h> 22 23 unsigned int sysctl_nr_open __read_mostly = 1024*1024; 24 unsigned int sysctl_nr_open_min = BITS_PER_LONG; 25 /* our min() is unusable in constant expressions ;-/ */ 26 #define __const_min(x, y) ((x) < (y) ? (x) : (y)) 27 unsigned int sysctl_nr_open_max = 28 __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG; 29 30 static void __free_fdtable(struct fdtable *fdt) 31 { 32 kvfree(fdt->fd); 33 kvfree(fdt->open_fds); 34 kfree(fdt); 35 } 36 37 static void free_fdtable_rcu(struct rcu_head *rcu) 38 { 39 __free_fdtable(container_of(rcu, struct fdtable, rcu)); 40 } 41 42 #define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr)) 43 #define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long)) 44 45 /* 46 * Copy 'count' fd bits from the old table to the new table and clear the extra 47 * space if any. This does not copy the file pointers. Called with the files 48 * spinlock held for write. 49 */ 50 static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, 51 unsigned int count) 52 { 53 unsigned int cpy, set; 54 55 cpy = count / BITS_PER_BYTE; 56 set = (nfdt->max_fds - count) / BITS_PER_BYTE; 57 memcpy(nfdt->open_fds, ofdt->open_fds, cpy); 58 memset((char *)nfdt->open_fds + cpy, 0, set); 59 memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy); 60 memset((char *)nfdt->close_on_exec + cpy, 0, set); 61 62 cpy = BITBIT_SIZE(count); 63 set = BITBIT_SIZE(nfdt->max_fds) - cpy; 64 memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy); 65 memset((char *)nfdt->full_fds_bits + cpy, 0, set); 66 } 67 68 /* 69 * Copy all file descriptors from the old table to the new, expanded table and 70 * clear the extra space. Called with the files spinlock held for write. 71 */ 72 static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) 73 { 74 size_t cpy, set; 75 76 BUG_ON(nfdt->max_fds < ofdt->max_fds); 77 78 cpy = ofdt->max_fds * sizeof(struct file *); 79 set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *); 80 memcpy(nfdt->fd, ofdt->fd, cpy); 81 memset((char *)nfdt->fd + cpy, 0, set); 82 83 copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds); 84 } 85 86 static struct fdtable * alloc_fdtable(unsigned int nr) 87 { 88 struct fdtable *fdt; 89 void *data; 90 91 /* 92 * Figure out how many fds we actually want to support in this fdtable. 93 * Allocation steps are keyed to the size of the fdarray, since it 94 * grows far faster than any of the other dynamic data. We try to fit 95 * the fdarray into comfortable page-tuned chunks: starting at 1024B 96 * and growing in powers of two from there on. 97 */ 98 nr /= (1024 / sizeof(struct file *)); 99 nr = roundup_pow_of_two(nr + 1); 100 nr *= (1024 / sizeof(struct file *)); 101 /* 102 * Note that this can drive nr *below* what we had passed if sysctl_nr_open 103 * had been set lower between the check in expand_files() and here. Deal 104 * with that in caller, it's cheaper that way. 105 * 106 * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise 107 * bitmaps handling below becomes unpleasant, to put it mildly... 108 */ 109 if (unlikely(nr > sysctl_nr_open)) 110 nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1; 111 112 fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT); 113 if (!fdt) 114 goto out; 115 fdt->max_fds = nr; 116 data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT); 117 if (!data) 118 goto out_fdt; 119 fdt->fd = data; 120 121 data = kvmalloc(max_t(size_t, 122 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES), 123 GFP_KERNEL_ACCOUNT); 124 if (!data) 125 goto out_arr; 126 fdt->open_fds = data; 127 data += nr / BITS_PER_BYTE; 128 fdt->close_on_exec = data; 129 data += nr / BITS_PER_BYTE; 130 fdt->full_fds_bits = data; 131 132 return fdt; 133 134 out_arr: 135 kvfree(fdt->fd); 136 out_fdt: 137 kfree(fdt); 138 out: 139 return NULL; 140 } 141 142 /* 143 * Expand the file descriptor table. 144 * This function will allocate a new fdtable and both fd array and fdset, of 145 * the given size. 146 * Return <0 error code on error; 1 on successful completion. 147 * The files->file_lock should be held on entry, and will be held on exit. 148 */ 149 static int expand_fdtable(struct files_struct *files, unsigned int nr) 150 __releases(files->file_lock) 151 __acquires(files->file_lock) 152 { 153 struct fdtable *new_fdt, *cur_fdt; 154 155 spin_unlock(&files->file_lock); 156 new_fdt = alloc_fdtable(nr); 157 158 /* make sure all __fd_install() have seen resize_in_progress 159 * or have finished their rcu_read_lock_sched() section. 160 */ 161 if (atomic_read(&files->count) > 1) 162 synchronize_rcu(); 163 164 spin_lock(&files->file_lock); 165 if (!new_fdt) 166 return -ENOMEM; 167 /* 168 * extremely unlikely race - sysctl_nr_open decreased between the check in 169 * caller and alloc_fdtable(). Cheaper to catch it here... 170 */ 171 if (unlikely(new_fdt->max_fds <= nr)) { 172 __free_fdtable(new_fdt); 173 return -EMFILE; 174 } 175 cur_fdt = files_fdtable(files); 176 BUG_ON(nr < cur_fdt->max_fds); 177 copy_fdtable(new_fdt, cur_fdt); 178 rcu_assign_pointer(files->fdt, new_fdt); 179 if (cur_fdt != &files->fdtab) 180 call_rcu(&cur_fdt->rcu, free_fdtable_rcu); 181 /* coupled with smp_rmb() in __fd_install() */ 182 smp_wmb(); 183 return 1; 184 } 185 186 /* 187 * Expand files. 188 * This function will expand the file structures, if the requested size exceeds 189 * the current capacity and there is room for expansion. 190 * Return <0 error code on error; 0 when nothing done; 1 when files were 191 * expanded and execution may have blocked. 192 * The files->file_lock should be held on entry, and will be held on exit. 193 */ 194 static int expand_files(struct files_struct *files, unsigned int nr) 195 __releases(files->file_lock) 196 __acquires(files->file_lock) 197 { 198 struct fdtable *fdt; 199 int expanded = 0; 200 201 repeat: 202 fdt = files_fdtable(files); 203 204 /* Do we need to expand? */ 205 if (nr < fdt->max_fds) 206 return expanded; 207 208 /* Can we expand? */ 209 if (nr >= sysctl_nr_open) 210 return -EMFILE; 211 212 if (unlikely(files->resize_in_progress)) { 213 spin_unlock(&files->file_lock); 214 expanded = 1; 215 wait_event(files->resize_wait, !files->resize_in_progress); 216 spin_lock(&files->file_lock); 217 goto repeat; 218 } 219 220 /* All good, so we try */ 221 files->resize_in_progress = true; 222 expanded = expand_fdtable(files, nr); 223 files->resize_in_progress = false; 224 225 wake_up_all(&files->resize_wait); 226 return expanded; 227 } 228 229 static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt) 230 { 231 __set_bit(fd, fdt->close_on_exec); 232 } 233 234 static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt) 235 { 236 if (test_bit(fd, fdt->close_on_exec)) 237 __clear_bit(fd, fdt->close_on_exec); 238 } 239 240 static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt) 241 { 242 __set_bit(fd, fdt->open_fds); 243 fd /= BITS_PER_LONG; 244 if (!~fdt->open_fds[fd]) 245 __set_bit(fd, fdt->full_fds_bits); 246 } 247 248 static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt) 249 { 250 __clear_bit(fd, fdt->open_fds); 251 __clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits); 252 } 253 254 static unsigned int count_open_files(struct fdtable *fdt) 255 { 256 unsigned int size = fdt->max_fds; 257 unsigned int i; 258 259 /* Find the last open fd */ 260 for (i = size / BITS_PER_LONG; i > 0; ) { 261 if (fdt->open_fds[--i]) 262 break; 263 } 264 i = (i + 1) * BITS_PER_LONG; 265 return i; 266 } 267 268 /* 269 * Allocate a new files structure and copy contents from the 270 * passed in files structure. 271 * errorp will be valid only when the returned files_struct is NULL. 272 */ 273 struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) 274 { 275 struct files_struct *newf; 276 struct file **old_fds, **new_fds; 277 unsigned int open_files, i; 278 struct fdtable *old_fdt, *new_fdt; 279 280 *errorp = -ENOMEM; 281 newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); 282 if (!newf) 283 goto out; 284 285 atomic_set(&newf->count, 1); 286 287 spin_lock_init(&newf->file_lock); 288 newf->resize_in_progress = false; 289 init_waitqueue_head(&newf->resize_wait); 290 newf->next_fd = 0; 291 new_fdt = &newf->fdtab; 292 new_fdt->max_fds = NR_OPEN_DEFAULT; 293 new_fdt->close_on_exec = newf->close_on_exec_init; 294 new_fdt->open_fds = newf->open_fds_init; 295 new_fdt->full_fds_bits = newf->full_fds_bits_init; 296 new_fdt->fd = &newf->fd_array[0]; 297 298 spin_lock(&oldf->file_lock); 299 old_fdt = files_fdtable(oldf); 300 open_files = count_open_files(old_fdt); 301 302 /* 303 * Check whether we need to allocate a larger fd array and fd set. 304 */ 305 while (unlikely(open_files > new_fdt->max_fds)) { 306 spin_unlock(&oldf->file_lock); 307 308 if (new_fdt != &newf->fdtab) 309 __free_fdtable(new_fdt); 310 311 new_fdt = alloc_fdtable(open_files - 1); 312 if (!new_fdt) { 313 *errorp = -ENOMEM; 314 goto out_release; 315 } 316 317 /* beyond sysctl_nr_open; nothing to do */ 318 if (unlikely(new_fdt->max_fds < open_files)) { 319 __free_fdtable(new_fdt); 320 *errorp = -EMFILE; 321 goto out_release; 322 } 323 324 /* 325 * Reacquire the oldf lock and a pointer to its fd table 326 * who knows it may have a new bigger fd table. We need 327 * the latest pointer. 328 */ 329 spin_lock(&oldf->file_lock); 330 old_fdt = files_fdtable(oldf); 331 open_files = count_open_files(old_fdt); 332 } 333 334 copy_fd_bitmaps(new_fdt, old_fdt, open_files); 335 336 old_fds = old_fdt->fd; 337 new_fds = new_fdt->fd; 338 339 for (i = open_files; i != 0; i--) { 340 struct file *f = *old_fds++; 341 if (f) { 342 get_file(f); 343 } else { 344 /* 345 * The fd may be claimed in the fd bitmap but not yet 346 * instantiated in the files array if a sibling thread 347 * is partway through open(). So make sure that this 348 * fd is available to the new process. 349 */ 350 __clear_open_fd(open_files - i, new_fdt); 351 } 352 rcu_assign_pointer(*new_fds++, f); 353 } 354 spin_unlock(&oldf->file_lock); 355 356 /* clear the remainder */ 357 memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *)); 358 359 rcu_assign_pointer(newf->fdt, new_fdt); 360 361 return newf; 362 363 out_release: 364 kmem_cache_free(files_cachep, newf); 365 out: 366 return NULL; 367 } 368 369 static struct fdtable *close_files(struct files_struct * files) 370 { 371 /* 372 * It is safe to dereference the fd table without RCU or 373 * ->file_lock because this is the last reference to the 374 * files structure. 375 */ 376 struct fdtable *fdt = rcu_dereference_raw(files->fdt); 377 unsigned int i, j = 0; 378 379 for (;;) { 380 unsigned long set; 381 i = j * BITS_PER_LONG; 382 if (i >= fdt->max_fds) 383 break; 384 set = fdt->open_fds[j++]; 385 while (set) { 386 if (set & 1) { 387 struct file * file = xchg(&fdt->fd[i], NULL); 388 if (file) { 389 filp_close(file, files); 390 cond_resched(); 391 } 392 } 393 i++; 394 set >>= 1; 395 } 396 } 397 398 return fdt; 399 } 400 401 struct files_struct *get_files_struct(struct task_struct *task) 402 { 403 struct files_struct *files; 404 405 task_lock(task); 406 files = task->files; 407 if (files) 408 atomic_inc(&files->count); 409 task_unlock(task); 410 411 return files; 412 } 413 414 void put_files_struct(struct files_struct *files) 415 { 416 if (atomic_dec_and_test(&files->count)) { 417 struct fdtable *fdt = close_files(files); 418 419 /* free the arrays if they are not embedded */ 420 if (fdt != &files->fdtab) 421 __free_fdtable(fdt); 422 kmem_cache_free(files_cachep, files); 423 } 424 } 425 426 void reset_files_struct(struct files_struct *files) 427 { 428 struct task_struct *tsk = current; 429 struct files_struct *old; 430 431 old = tsk->files; 432 task_lock(tsk); 433 tsk->files = files; 434 task_unlock(tsk); 435 put_files_struct(old); 436 } 437 438 void exit_files(struct task_struct *tsk) 439 { 440 struct files_struct * files = tsk->files; 441 442 if (files) { 443 task_lock(tsk); 444 tsk->files = NULL; 445 task_unlock(tsk); 446 put_files_struct(files); 447 } 448 } 449 450 struct files_struct init_files = { 451 .count = ATOMIC_INIT(1), 452 .fdt = &init_files.fdtab, 453 .fdtab = { 454 .max_fds = NR_OPEN_DEFAULT, 455 .fd = &init_files.fd_array[0], 456 .close_on_exec = init_files.close_on_exec_init, 457 .open_fds = init_files.open_fds_init, 458 .full_fds_bits = init_files.full_fds_bits_init, 459 }, 460 .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), 461 .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait), 462 }; 463 464 static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) 465 { 466 unsigned int maxfd = fdt->max_fds; 467 unsigned int maxbit = maxfd / BITS_PER_LONG; 468 unsigned int bitbit = start / BITS_PER_LONG; 469 470 bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG; 471 if (bitbit > maxfd) 472 return maxfd; 473 if (bitbit > start) 474 start = bitbit; 475 return find_next_zero_bit(fdt->open_fds, maxfd, start); 476 } 477 478 /* 479 * allocate a file descriptor, mark it busy. 480 */ 481 int __alloc_fd(struct files_struct *files, 482 unsigned start, unsigned end, unsigned flags) 483 { 484 unsigned int fd; 485 int error; 486 struct fdtable *fdt; 487 488 spin_lock(&files->file_lock); 489 repeat: 490 fdt = files_fdtable(files); 491 fd = start; 492 if (fd < files->next_fd) 493 fd = files->next_fd; 494 495 if (fd < fdt->max_fds) 496 fd = find_next_fd(fdt, fd); 497 498 /* 499 * N.B. For clone tasks sharing a files structure, this test 500 * will limit the total number of files that can be opened. 501 */ 502 error = -EMFILE; 503 if (fd >= end) 504 goto out; 505 506 error = expand_files(files, fd); 507 if (error < 0) 508 goto out; 509 510 /* 511 * If we needed to expand the fs array we 512 * might have blocked - try again. 513 */ 514 if (error) 515 goto repeat; 516 517 if (start <= files->next_fd) 518 files->next_fd = fd + 1; 519 520 __set_open_fd(fd, fdt); 521 if (flags & O_CLOEXEC) 522 __set_close_on_exec(fd, fdt); 523 else 524 __clear_close_on_exec(fd, fdt); 525 error = fd; 526 #if 1 527 /* Sanity check */ 528 if (rcu_access_pointer(fdt->fd[fd]) != NULL) { 529 printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); 530 rcu_assign_pointer(fdt->fd[fd], NULL); 531 } 532 #endif 533 534 out: 535 spin_unlock(&files->file_lock); 536 return error; 537 } 538 539 static int alloc_fd(unsigned start, unsigned flags) 540 { 541 return __alloc_fd(current->files, start, rlimit(RLIMIT_NOFILE), flags); 542 } 543 544 int __get_unused_fd_flags(unsigned flags, unsigned long nofile) 545 { 546 return __alloc_fd(current->files, 0, nofile, flags); 547 } 548 549 int get_unused_fd_flags(unsigned flags) 550 { 551 return __get_unused_fd_flags(flags, rlimit(RLIMIT_NOFILE)); 552 } 553 EXPORT_SYMBOL(get_unused_fd_flags); 554 555 static void __put_unused_fd(struct files_struct *files, unsigned int fd) 556 { 557 struct fdtable *fdt = files_fdtable(files); 558 __clear_open_fd(fd, fdt); 559 if (fd < files->next_fd) 560 files->next_fd = fd; 561 } 562 563 void put_unused_fd(unsigned int fd) 564 { 565 struct files_struct *files = current->files; 566 spin_lock(&files->file_lock); 567 __put_unused_fd(files, fd); 568 spin_unlock(&files->file_lock); 569 } 570 571 EXPORT_SYMBOL(put_unused_fd); 572 573 /* 574 * Install a file pointer in the fd array. 575 * 576 * The VFS is full of places where we drop the files lock between 577 * setting the open_fds bitmap and installing the file in the file 578 * array. At any such point, we are vulnerable to a dup2() race 579 * installing a file in the array before us. We need to detect this and 580 * fput() the struct file we are about to overwrite in this case. 581 * 582 * It should never happen - if we allow dup2() do it, _really_ bad things 583 * will follow. 584 * 585 * NOTE: __fd_install() variant is really, really low-level; don't 586 * use it unless you are forced to by truly lousy API shoved down 587 * your throat. 'files' *MUST* be either current->files or obtained 588 * by get_files_struct(current) done by whoever had given it to you, 589 * or really bad things will happen. Normally you want to use 590 * fd_install() instead. 591 */ 592 593 void __fd_install(struct files_struct *files, unsigned int fd, 594 struct file *file) 595 { 596 struct fdtable *fdt; 597 598 rcu_read_lock_sched(); 599 600 if (unlikely(files->resize_in_progress)) { 601 rcu_read_unlock_sched(); 602 spin_lock(&files->file_lock); 603 fdt = files_fdtable(files); 604 BUG_ON(fdt->fd[fd] != NULL); 605 rcu_assign_pointer(fdt->fd[fd], file); 606 spin_unlock(&files->file_lock); 607 return; 608 } 609 /* coupled with smp_wmb() in expand_fdtable() */ 610 smp_rmb(); 611 fdt = rcu_dereference_sched(files->fdt); 612 BUG_ON(fdt->fd[fd] != NULL); 613 rcu_assign_pointer(fdt->fd[fd], file); 614 rcu_read_unlock_sched(); 615 } 616 617 void fd_install(unsigned int fd, struct file *file) 618 { 619 __fd_install(current->files, fd, file); 620 } 621 622 EXPORT_SYMBOL(fd_install); 623 624 static struct file *pick_file(struct files_struct *files, unsigned fd) 625 { 626 struct file *file = NULL; 627 struct fdtable *fdt; 628 629 spin_lock(&files->file_lock); 630 fdt = files_fdtable(files); 631 if (fd >= fdt->max_fds) 632 goto out_unlock; 633 file = fdt->fd[fd]; 634 if (!file) 635 goto out_unlock; 636 rcu_assign_pointer(fdt->fd[fd], NULL); 637 __put_unused_fd(files, fd); 638 639 out_unlock: 640 spin_unlock(&files->file_lock); 641 return file; 642 } 643 644 /* 645 * The same warnings as for __alloc_fd()/__fd_install() apply here... 646 */ 647 int __close_fd(struct files_struct *files, unsigned fd) 648 { 649 struct file *file; 650 651 file = pick_file(files, fd); 652 if (!file) 653 return -EBADF; 654 655 return filp_close(file, files); 656 } 657 EXPORT_SYMBOL(__close_fd); /* for ksys_close() */ 658 659 /** 660 * __close_range() - Close all file descriptors in a given range. 661 * 662 * @fd: starting file descriptor to close 663 * @max_fd: last file descriptor to close 664 * 665 * This closes a range of file descriptors. All file descriptors 666 * from @fd up to and including @max_fd are closed. 667 */ 668 int __close_range(struct files_struct *files, unsigned fd, unsigned max_fd) 669 { 670 unsigned int cur_max; 671 672 if (fd > max_fd) 673 return -EINVAL; 674 675 rcu_read_lock(); 676 cur_max = files_fdtable(files)->max_fds; 677 rcu_read_unlock(); 678 679 /* cap to last valid index into fdtable */ 680 cur_max--; 681 682 max_fd = min(max_fd, cur_max); 683 while (fd <= max_fd) { 684 struct file *file; 685 686 file = pick_file(files, fd++); 687 if (!file) 688 continue; 689 690 filp_close(file, files); 691 cond_resched(); 692 } 693 694 return 0; 695 } 696 697 /* 698 * variant of __close_fd that gets a ref on the file for later fput. 699 * The caller must ensure that filp_close() called on the file, and then 700 * an fput(). 701 */ 702 int __close_fd_get_file(unsigned int fd, struct file **res) 703 { 704 struct files_struct *files = current->files; 705 struct file *file; 706 struct fdtable *fdt; 707 708 spin_lock(&files->file_lock); 709 fdt = files_fdtable(files); 710 if (fd >= fdt->max_fds) 711 goto out_unlock; 712 file = fdt->fd[fd]; 713 if (!file) 714 goto out_unlock; 715 rcu_assign_pointer(fdt->fd[fd], NULL); 716 __put_unused_fd(files, fd); 717 spin_unlock(&files->file_lock); 718 get_file(file); 719 *res = file; 720 return 0; 721 722 out_unlock: 723 spin_unlock(&files->file_lock); 724 *res = NULL; 725 return -ENOENT; 726 } 727 728 void do_close_on_exec(struct files_struct *files) 729 { 730 unsigned i; 731 struct fdtable *fdt; 732 733 /* exec unshares first */ 734 spin_lock(&files->file_lock); 735 for (i = 0; ; i++) { 736 unsigned long set; 737 unsigned fd = i * BITS_PER_LONG; 738 fdt = files_fdtable(files); 739 if (fd >= fdt->max_fds) 740 break; 741 set = fdt->close_on_exec[i]; 742 if (!set) 743 continue; 744 fdt->close_on_exec[i] = 0; 745 for ( ; set ; fd++, set >>= 1) { 746 struct file *file; 747 if (!(set & 1)) 748 continue; 749 file = fdt->fd[fd]; 750 if (!file) 751 continue; 752 rcu_assign_pointer(fdt->fd[fd], NULL); 753 __put_unused_fd(files, fd); 754 spin_unlock(&files->file_lock); 755 filp_close(file, files); 756 cond_resched(); 757 spin_lock(&files->file_lock); 758 } 759 760 } 761 spin_unlock(&files->file_lock); 762 } 763 764 static struct file *__fget_files(struct files_struct *files, unsigned int fd, 765 fmode_t mask, unsigned int refs) 766 { 767 struct file *file; 768 769 rcu_read_lock(); 770 loop: 771 file = fcheck_files(files, fd); 772 if (file) { 773 /* File object ref couldn't be taken. 774 * dup2() atomicity guarantee is the reason 775 * we loop to catch the new file (or NULL pointer) 776 */ 777 if (file->f_mode & mask) 778 file = NULL; 779 else if (!get_file_rcu_many(file, refs)) 780 goto loop; 781 } 782 rcu_read_unlock(); 783 784 return file; 785 } 786 787 static inline struct file *__fget(unsigned int fd, fmode_t mask, 788 unsigned int refs) 789 { 790 return __fget_files(current->files, fd, mask, refs); 791 } 792 793 struct file *fget_many(unsigned int fd, unsigned int refs) 794 { 795 return __fget(fd, FMODE_PATH, refs); 796 } 797 798 struct file *fget(unsigned int fd) 799 { 800 return __fget(fd, FMODE_PATH, 1); 801 } 802 EXPORT_SYMBOL(fget); 803 804 struct file *fget_raw(unsigned int fd) 805 { 806 return __fget(fd, 0, 1); 807 } 808 EXPORT_SYMBOL(fget_raw); 809 810 struct file *fget_task(struct task_struct *task, unsigned int fd) 811 { 812 struct file *file = NULL; 813 814 task_lock(task); 815 if (task->files) 816 file = __fget_files(task->files, fd, 0, 1); 817 task_unlock(task); 818 819 return file; 820 } 821 822 /* 823 * Lightweight file lookup - no refcnt increment if fd table isn't shared. 824 * 825 * You can use this instead of fget if you satisfy all of the following 826 * conditions: 827 * 1) You must call fput_light before exiting the syscall and returning control 828 * to userspace (i.e. you cannot remember the returned struct file * after 829 * returning to userspace). 830 * 2) You must not call filp_close on the returned struct file * in between 831 * calls to fget_light and fput_light. 832 * 3) You must not clone the current task in between the calls to fget_light 833 * and fput_light. 834 * 835 * The fput_needed flag returned by fget_light should be passed to the 836 * corresponding fput_light. 837 */ 838 static unsigned long __fget_light(unsigned int fd, fmode_t mask) 839 { 840 struct files_struct *files = current->files; 841 struct file *file; 842 843 if (atomic_read(&files->count) == 1) { 844 file = __fcheck_files(files, fd); 845 if (!file || unlikely(file->f_mode & mask)) 846 return 0; 847 return (unsigned long)file; 848 } else { 849 file = __fget(fd, mask, 1); 850 if (!file) 851 return 0; 852 return FDPUT_FPUT | (unsigned long)file; 853 } 854 } 855 unsigned long __fdget(unsigned int fd) 856 { 857 return __fget_light(fd, FMODE_PATH); 858 } 859 EXPORT_SYMBOL(__fdget); 860 861 unsigned long __fdget_raw(unsigned int fd) 862 { 863 return __fget_light(fd, 0); 864 } 865 866 unsigned long __fdget_pos(unsigned int fd) 867 { 868 unsigned long v = __fdget(fd); 869 struct file *file = (struct file *)(v & ~3); 870 871 if (file && (file->f_mode & FMODE_ATOMIC_POS)) { 872 if (file_count(file) > 1) { 873 v |= FDPUT_POS_UNLOCK; 874 mutex_lock(&file->f_pos_lock); 875 } 876 } 877 return v; 878 } 879 880 void __f_unlock_pos(struct file *f) 881 { 882 mutex_unlock(&f->f_pos_lock); 883 } 884 885 /* 886 * We only lock f_pos if we have threads or if the file might be 887 * shared with another process. In both cases we'll have an elevated 888 * file count (done either by fdget() or by fork()). 889 */ 890 891 void set_close_on_exec(unsigned int fd, int flag) 892 { 893 struct files_struct *files = current->files; 894 struct fdtable *fdt; 895 spin_lock(&files->file_lock); 896 fdt = files_fdtable(files); 897 if (flag) 898 __set_close_on_exec(fd, fdt); 899 else 900 __clear_close_on_exec(fd, fdt); 901 spin_unlock(&files->file_lock); 902 } 903 904 bool get_close_on_exec(unsigned int fd) 905 { 906 struct files_struct *files = current->files; 907 struct fdtable *fdt; 908 bool res; 909 rcu_read_lock(); 910 fdt = files_fdtable(files); 911 res = close_on_exec(fd, fdt); 912 rcu_read_unlock(); 913 return res; 914 } 915 916 static int do_dup2(struct files_struct *files, 917 struct file *file, unsigned fd, unsigned flags) 918 __releases(&files->file_lock) 919 { 920 struct file *tofree; 921 struct fdtable *fdt; 922 923 /* 924 * We need to detect attempts to do dup2() over allocated but still 925 * not finished descriptor. NB: OpenBSD avoids that at the price of 926 * extra work in their equivalent of fget() - they insert struct 927 * file immediately after grabbing descriptor, mark it larval if 928 * more work (e.g. actual opening) is needed and make sure that 929 * fget() treats larval files as absent. Potentially interesting, 930 * but while extra work in fget() is trivial, locking implications 931 * and amount of surgery on open()-related paths in VFS are not. 932 * FreeBSD fails with -EBADF in the same situation, NetBSD "solution" 933 * deadlocks in rather amusing ways, AFAICS. All of that is out of 934 * scope of POSIX or SUS, since neither considers shared descriptor 935 * tables and this condition does not arise without those. 936 */ 937 fdt = files_fdtable(files); 938 tofree = fdt->fd[fd]; 939 if (!tofree && fd_is_open(fd, fdt)) 940 goto Ebusy; 941 get_file(file); 942 rcu_assign_pointer(fdt->fd[fd], file); 943 __set_open_fd(fd, fdt); 944 if (flags & O_CLOEXEC) 945 __set_close_on_exec(fd, fdt); 946 else 947 __clear_close_on_exec(fd, fdt); 948 spin_unlock(&files->file_lock); 949 950 if (tofree) 951 filp_close(tofree, files); 952 953 return fd; 954 955 Ebusy: 956 spin_unlock(&files->file_lock); 957 return -EBUSY; 958 } 959 960 int replace_fd(unsigned fd, struct file *file, unsigned flags) 961 { 962 int err; 963 struct files_struct *files = current->files; 964 965 if (!file) 966 return __close_fd(files, fd); 967 968 if (fd >= rlimit(RLIMIT_NOFILE)) 969 return -EBADF; 970 971 spin_lock(&files->file_lock); 972 err = expand_files(files, fd); 973 if (unlikely(err < 0)) 974 goto out_unlock; 975 return do_dup2(files, file, fd, flags); 976 977 out_unlock: 978 spin_unlock(&files->file_lock); 979 return err; 980 } 981 982 static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags) 983 { 984 int err = -EBADF; 985 struct file *file; 986 struct files_struct *files = current->files; 987 988 if ((flags & ~O_CLOEXEC) != 0) 989 return -EINVAL; 990 991 if (unlikely(oldfd == newfd)) 992 return -EINVAL; 993 994 if (newfd >= rlimit(RLIMIT_NOFILE)) 995 return -EBADF; 996 997 spin_lock(&files->file_lock); 998 err = expand_files(files, newfd); 999 file = fcheck(oldfd); 1000 if (unlikely(!file)) 1001 goto Ebadf; 1002 if (unlikely(err < 0)) { 1003 if (err == -EMFILE) 1004 goto Ebadf; 1005 goto out_unlock; 1006 } 1007 return do_dup2(files, file, newfd, flags); 1008 1009 Ebadf: 1010 err = -EBADF; 1011 out_unlock: 1012 spin_unlock(&files->file_lock); 1013 return err; 1014 } 1015 1016 SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) 1017 { 1018 return ksys_dup3(oldfd, newfd, flags); 1019 } 1020 1021 SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) 1022 { 1023 if (unlikely(newfd == oldfd)) { /* corner case */ 1024 struct files_struct *files = current->files; 1025 int retval = oldfd; 1026 1027 rcu_read_lock(); 1028 if (!fcheck_files(files, oldfd)) 1029 retval = -EBADF; 1030 rcu_read_unlock(); 1031 return retval; 1032 } 1033 return ksys_dup3(oldfd, newfd, 0); 1034 } 1035 1036 int ksys_dup(unsigned int fildes) 1037 { 1038 int ret = -EBADF; 1039 struct file *file = fget_raw(fildes); 1040 1041 if (file) { 1042 ret = get_unused_fd_flags(0); 1043 if (ret >= 0) 1044 fd_install(ret, file); 1045 else 1046 fput(file); 1047 } 1048 return ret; 1049 } 1050 1051 SYSCALL_DEFINE1(dup, unsigned int, fildes) 1052 { 1053 return ksys_dup(fildes); 1054 } 1055 1056 int f_dupfd(unsigned int from, struct file *file, unsigned flags) 1057 { 1058 int err; 1059 if (from >= rlimit(RLIMIT_NOFILE)) 1060 return -EINVAL; 1061 err = alloc_fd(from, flags); 1062 if (err >= 0) { 1063 get_file(file); 1064 fd_install(err, file); 1065 } 1066 return err; 1067 } 1068 1069 int iterate_fd(struct files_struct *files, unsigned n, 1070 int (*f)(const void *, struct file *, unsigned), 1071 const void *p) 1072 { 1073 struct fdtable *fdt; 1074 int res = 0; 1075 if (!files) 1076 return 0; 1077 spin_lock(&files->file_lock); 1078 for (fdt = files_fdtable(files); n < fdt->max_fds; n++) { 1079 struct file *file; 1080 file = rcu_dereference_check_fdtable(files, fdt->fd[n]); 1081 if (!file) 1082 continue; 1083 res = f(p, file, n); 1084 if (res) 1085 break; 1086 } 1087 spin_unlock(&files->file_lock); 1088 return res; 1089 } 1090 EXPORT_SYMBOL(iterate_fd); 1091