/*
 * linux/fs/file.c
 *
 * Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
 *
 * Manage the dynamic fd arrays in the process files_struct.
 */

#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/time.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/bitops.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/workqueue.h>

int sysctl_nr_open __read_mostly = 1024*1024;
int sysctl_nr_open_min = BITS_PER_LONG;
int sysctl_nr_open_max = 1024 * 1024; /* raised later */

static void *alloc_fdmem(size_t size)
{
	/*
	 * Very large allocations can stress page reclaim, so fall back to
	 * vmalloc() if the allocation size will be considered "large" by the VM.
	 */
	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
		void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN);
		if (data != NULL)
			return data;
	}
	return vmalloc(size);
}

static void free_fdmem(void *ptr)
{
	is_vmalloc_addr(ptr) ? vfree(ptr) : kfree(ptr);
}

static void __free_fdtable(struct fdtable *fdt)
{
	free_fdmem(fdt->fd);
	free_fdmem(fdt->open_fds);
	kfree(fdt);
}

static void free_fdtable_rcu(struct rcu_head *rcu)
{
	__free_fdtable(container_of(rcu, struct fdtable, rcu));
}

/*
 * Expand the fdset in the files_struct.  Called with the files spinlock
 * held for write.
 */
static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
{
	unsigned int cpy, set;

	BUG_ON(nfdt->max_fds < ofdt->max_fds);

	cpy = ofdt->max_fds * sizeof(struct file *);
	set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
	memcpy(nfdt->fd, ofdt->fd, cpy);
	memset((char *)(nfdt->fd) + cpy, 0, set);

	cpy = ofdt->max_fds / BITS_PER_BYTE;
	set = (nfdt->max_fds - ofdt->max_fds) / BITS_PER_BYTE;
	memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
	memset((char *)(nfdt->open_fds) + cpy, 0, set);
	memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
	memset((char *)(nfdt->close_on_exec) + cpy, 0, set);
}

static struct fdtable * alloc_fdtable(unsigned int nr)
{
	struct fdtable *fdt;
	void *data;

	/*
	 * Figure out how many fds we actually want to support in this fdtable.
	 * Allocation steps are keyed to the size of the fdarray, since it
	 * grows far faster than any of the other dynamic data. We try to fit
	 * the fdarray into comfortable page-tuned chunks: starting at 1024B
	 * and growing in powers of two from there on.
	 */
	nr /= (1024 / sizeof(struct file *));
	nr = roundup_pow_of_two(nr + 1);
	nr *= (1024 / sizeof(struct file *));
	/*
	 * Note that this can drive nr *below* what we had passed if sysctl_nr_open
	 * had been set lower between the check in expand_files() and here.  Deal
	 * with that in caller, it's cheaper that way.
	 *
	 * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
	 * bitmaps handling below becomes unpleasant, to put it mildly...
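	 *
	 * A worked example, for illustration only (this assumes 64-bit
	 * pointers, i.e. 128 slots per 1024 bytes): a request for nr = 300
	 * becomes 300 / 128 = 2, is rounded up to the next power of two
	 * (2 + 1 -> 4) and scaled back to 4 * 128 = 512 slots - a 4096-byte,
	 * page-sized fd array.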
	 */
	if (unlikely(nr > sysctl_nr_open))
		nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;

	fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
	if (!fdt)
		goto out;
	fdt->max_fds = nr;
	data = alloc_fdmem(nr * sizeof(struct file *));
	if (!data)
		goto out_fdt;
	fdt->fd = data;

	data = alloc_fdmem(max_t(size_t,
				 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES));
	if (!data)
		goto out_arr;
	fdt->open_fds = data;
	data += nr / BITS_PER_BYTE;
	fdt->close_on_exec = data;

	return fdt;

out_arr:
	free_fdmem(fdt->fd);
out_fdt:
	kfree(fdt);
out:
	return NULL;
}

/*
 * Expand the file descriptor table.
 * This function will allocate a new fdtable and both fd array and fdset, of
 * the given size.
 * Return <0 error code on error; 1 on successful completion.
 * The files->file_lock should be held on entry, and will be held on exit.
 */
static int expand_fdtable(struct files_struct *files, int nr)
	__releases(files->file_lock)
	__acquires(files->file_lock)
{
	struct fdtable *new_fdt, *cur_fdt;

	spin_unlock(&files->file_lock);
	new_fdt = alloc_fdtable(nr);
	spin_lock(&files->file_lock);
	if (!new_fdt)
		return -ENOMEM;
	/*
	 * extremely unlikely race - sysctl_nr_open decreased between the check in
	 * caller and alloc_fdtable().  Cheaper to catch it here...
	 */
	if (unlikely(new_fdt->max_fds <= nr)) {
		__free_fdtable(new_fdt);
		return -EMFILE;
	}
	/*
	 * Check again since another task may have expanded the fd table while
	 * we dropped the lock
	 */
	cur_fdt = files_fdtable(files);
	if (nr >= cur_fdt->max_fds) {
		/* Continue as planned */
		copy_fdtable(new_fdt, cur_fdt);
		rcu_assign_pointer(files->fdt, new_fdt);
		if (cur_fdt != &files->fdtab)
			call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
	} else {
		/* Somebody else expanded, so undo our attempt */
		__free_fdtable(new_fdt);
	}
	return 1;
}

/*
 * Expand files.
 * This function will expand the file structures, if the requested size exceeds
 * the current capacity and there is room for expansion.
 * Return <0 error code on error; 0 when nothing done; 1 when files were
 * expanded and execution may have blocked.
 * The files->file_lock should be held on entry, and will be held on exit.
 */
static int expand_files(struct files_struct *files, int nr)
{
	struct fdtable *fdt;

	fdt = files_fdtable(files);

	/* Do we need to expand? */
	if (nr < fdt->max_fds)
		return 0;

	/* Can we expand? */
	if (nr >= sysctl_nr_open)
		return -EMFILE;

	/* All good, so we try */
	return expand_fdtable(files, nr);
}

static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
{
	__set_bit(fd, fdt->close_on_exec);
}

static inline void __clear_close_on_exec(int fd, struct fdtable *fdt)
{
	__clear_bit(fd, fdt->close_on_exec);
}

static inline void __set_open_fd(int fd, struct fdtable *fdt)
{
	__set_bit(fd, fdt->open_fds);
}

static inline void __clear_open_fd(int fd, struct fdtable *fdt)
{
	__clear_bit(fd, fdt->open_fds);
}

static int count_open_files(struct fdtable *fdt)
{
	int size = fdt->max_fds;
	int i;

	/* Find the last open fd */
	for (i = size / BITS_PER_LONG; i > 0; ) {
		if (fdt->open_fds[--i])
			break;
	}
	i = (i + 1) * BITS_PER_LONG;
	return i;
}

/*
 * Allocate a new files structure and copy contents from the
 * passed in files structure.
 * errorp will be valid only when the returned files_struct is NULL.
 */
struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
{
	struct files_struct *newf;
	struct file **old_fds, **new_fds;
	int open_files, size, i;
	struct fdtable *old_fdt, *new_fdt;

	*errorp = -ENOMEM;
	newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
	if (!newf)
		goto out;

	atomic_set(&newf->count, 1);

	spin_lock_init(&newf->file_lock);
	newf->next_fd = 0;
	new_fdt = &newf->fdtab;
	new_fdt->max_fds = NR_OPEN_DEFAULT;
	new_fdt->close_on_exec = newf->close_on_exec_init;
	new_fdt->open_fds = newf->open_fds_init;
	new_fdt->fd = &newf->fd_array[0];

	spin_lock(&oldf->file_lock);
	old_fdt = files_fdtable(oldf);
	open_files = count_open_files(old_fdt);

	/*
	 * Check whether we need to allocate a larger fd array and fd set.
	 */
	while (unlikely(open_files > new_fdt->max_fds)) {
		spin_unlock(&oldf->file_lock);

		if (new_fdt != &newf->fdtab)
			__free_fdtable(new_fdt);

		new_fdt = alloc_fdtable(open_files - 1);
		if (!new_fdt) {
			*errorp = -ENOMEM;
			goto out_release;
		}

		/* beyond sysctl_nr_open; nothing to do */
		if (unlikely(new_fdt->max_fds < open_files)) {
			__free_fdtable(new_fdt);
			*errorp = -EMFILE;
			goto out_release;
		}

		/*
		 * Reacquire the oldf lock and a pointer to its fd table -
		 * it may have gained a new, bigger fd table while we
		 * dropped the lock, and we need the latest pointer.
		 */
		spin_lock(&oldf->file_lock);
		old_fdt = files_fdtable(oldf);
		open_files = count_open_files(old_fdt);
	}

	old_fds = old_fdt->fd;
	new_fds = new_fdt->fd;

	memcpy(new_fdt->open_fds, old_fdt->open_fds, open_files / 8);
	memcpy(new_fdt->close_on_exec, old_fdt->close_on_exec, open_files / 8);

	for (i = open_files; i != 0; i--) {
		struct file *f = *old_fds++;
		if (f) {
			get_file(f);
		} else {
			/*
			 * The fd may be claimed in the fd bitmap but not yet
			 * instantiated in the files array if a sibling thread
			 * is partway through open().  So make sure that this
			 * fd is available to the new process.
			 */
			__clear_open_fd(open_files - i, new_fdt);
		}
		rcu_assign_pointer(*new_fds++, f);
	}
	spin_unlock(&oldf->file_lock);

	/* compute the remainder to be cleared */
	size = (new_fdt->max_fds - open_files) * sizeof(struct file *);

	/* This is long word aligned thus could use an optimized version */
	memset(new_fds, 0, size);

	if (new_fdt->max_fds > open_files) {
		int left = (new_fdt->max_fds - open_files) / 8;
		int start = open_files / BITS_PER_LONG;

		memset(&new_fdt->open_fds[start], 0, left);
		memset(&new_fdt->close_on_exec[start], 0, left);
	}

	rcu_assign_pointer(newf->fdt, new_fdt);

	return newf;

out_release:
	kmem_cache_free(files_cachep, newf);
out:
	return NULL;
}

static void close_files(struct files_struct * files)
{
	int i, j;
	struct fdtable *fdt;

	j = 0;

	/*
	 * It is safe to dereference the fd table without RCU or
	 * ->file_lock because this is the last reference to the
	 * files structure.  But use RCU to shut RCU-lockdep up.
	 */
	rcu_read_lock();
	fdt = files_fdtable(files);
	rcu_read_unlock();
	for (;;) {
		unsigned long set;
		i = j * BITS_PER_LONG;
		if (i >= fdt->max_fds)
			break;
		set = fdt->open_fds[j++];
		while (set) {
			if (set & 1) {
				struct file * file = xchg(&fdt->fd[i], NULL);
				if (file) {
					filp_close(file, files);
					cond_resched();
				}
			}
			i++;
			set >>= 1;
		}
	}
}

struct files_struct *get_files_struct(struct task_struct *task)
{
	struct files_struct *files;

	task_lock(task);
	files = task->files;
	if (files)
		atomic_inc(&files->count);
	task_unlock(task);

	return files;
}

void put_files_struct(struct files_struct *files)
{
	struct fdtable *fdt;

	if (atomic_dec_and_test(&files->count)) {
		close_files(files);
		/* not really needed, since nobody can see us */
		rcu_read_lock();
		fdt = files_fdtable(files);
		rcu_read_unlock();
		/* free the arrays if they are not embedded */
		if (fdt != &files->fdtab)
			__free_fdtable(fdt);
		kmem_cache_free(files_cachep, files);
	}
}

void reset_files_struct(struct files_struct *files)
{
	struct task_struct *tsk = current;
	struct files_struct *old;

	old = tsk->files;
	task_lock(tsk);
	tsk->files = files;
	task_unlock(tsk);
	put_files_struct(old);
}

void exit_files(struct task_struct *tsk)
{
	struct files_struct * files = tsk->files;

	if (files) {
		task_lock(tsk);
		tsk->files = NULL;
		task_unlock(tsk);
		put_files_struct(files);
	}
}

void __init files_defer_init(void)
{
	sysctl_nr_open_max = min((size_t)INT_MAX, ~(size_t)0/sizeof(void *)) &
			     -BITS_PER_LONG;
}

struct files_struct init_files = {
	.count		= ATOMIC_INIT(1),
	.fdt		= &init_files.fdtab,
	.fdtab		= {
		.max_fds	= NR_OPEN_DEFAULT,
		.fd		= &init_files.fd_array[0],
		.close_on_exec	= init_files.close_on_exec_init,
		.open_fds	= init_files.open_fds_init,
	},
	.file_lock	= __SPIN_LOCK_UNLOCKED(init_files.file_lock),
};

/*
 * allocate a file descriptor, mark it busy.
 */
int __alloc_fd(struct files_struct *files,
	       unsigned start, unsigned end, unsigned flags)
{
	unsigned int fd;
	int error;
	struct fdtable *fdt;

	spin_lock(&files->file_lock);
repeat:
	fdt = files_fdtable(files);
	fd = start;
	if (fd < files->next_fd)
		fd = files->next_fd;

	if (fd < fdt->max_fds)
		fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd);

	/*
	 * N.B. For clone tasks sharing a files structure, this test
	 * will limit the total number of files that can be opened.
	 */
	error = -EMFILE;
	if (fd >= end)
		goto out;

	error = expand_files(files, fd);
	if (error < 0)
		goto out;

	/*
	 * If we needed to expand the fd array we
	 * might have blocked - try again.
	 */
	if (error)
		goto repeat;

	if (start <= files->next_fd)
		files->next_fd = fd + 1;

	__set_open_fd(fd, fdt);
	if (flags & O_CLOEXEC)
		__set_close_on_exec(fd, fdt);
	else
		__clear_close_on_exec(fd, fdt);
	error = fd;
#if 1
	/* Sanity check */
	if (rcu_dereference_raw(fdt->fd[fd]) != NULL) {
		printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
		rcu_assign_pointer(fdt->fd[fd], NULL);
	}
#endif

out:
	spin_unlock(&files->file_lock);
	return error;
}

static int alloc_fd(unsigned start, unsigned flags)
{
	return __alloc_fd(current->files, start, rlimit(RLIMIT_NOFILE), flags);
}

int get_unused_fd_flags(unsigned flags)
{
	return __alloc_fd(current->files, 0, rlimit(RLIMIT_NOFILE), flags);
}
EXPORT_SYMBOL(get_unused_fd_flags);

static void __put_unused_fd(struct files_struct *files, unsigned int fd)
{
	struct fdtable *fdt = files_fdtable(files);
	__clear_open_fd(fd, fdt);
	if (fd < files->next_fd)
		files->next_fd = fd;
}

void put_unused_fd(unsigned int fd)
{
	struct files_struct *files = current->files;
	spin_lock(&files->file_lock);
	__put_unused_fd(files, fd);
	spin_unlock(&files->file_lock);
}

EXPORT_SYMBOL(put_unused_fd);

/*
 * Install a file pointer in the fd array.
 *
 * The VFS is full of places where we drop the files lock between
 * setting the open_fds bitmap and installing the file in the file
 * array.  At any such point, we are vulnerable to a dup2() race
 * installing a file in the array before us.  We need to detect this and
 * fput() the struct file we are about to overwrite in this case.
 *
 * It should never happen - if we allow dup2() do it, _really_ bad things
 * will follow.
 *
 * NOTE: __fd_install() variant is really, really low-level; don't
 * use it unless you are forced to by truly lousy API shoved down
 * your throat.  'files' *MUST* be either current->files or obtained
 * by get_files_struct(current) done by whoever had given it to you,
 * or really bad things will happen.  Normally you want to use
 * fd_install() instead.
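 *
 * For orientation, the usual calling pattern looks roughly like the sketch
 * below (illustrative only; the call that produces the struct file and the
 * error handling are placeholders, not code from this file):
 *
 *	fd = get_unused_fd_flags(O_CLOEXEC);
 *	if (fd < 0)
 *		return fd;
 *	file = ...;	(whatever actually opens/creates the struct file)
 *	if (IS_ERR(file)) {
 *		put_unused_fd(fd);
 *		return PTR_ERR(file);
 *	}
 *	fd_install(fd, file);
 *	return fd;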
 */

void __fd_install(struct files_struct *files, unsigned int fd,
		struct file *file)
{
	struct fdtable *fdt;
	spin_lock(&files->file_lock);
	fdt = files_fdtable(files);
	BUG_ON(fdt->fd[fd] != NULL);
	rcu_assign_pointer(fdt->fd[fd], file);
	spin_unlock(&files->file_lock);
}

void fd_install(unsigned int fd, struct file *file)
{
	__fd_install(current->files, fd, file);
}

EXPORT_SYMBOL(fd_install);

/*
 * The same warnings as for __alloc_fd()/__fd_install() apply here...
 */
int __close_fd(struct files_struct *files, unsigned fd)
{
	struct file *file;
	struct fdtable *fdt;

	spin_lock(&files->file_lock);
	fdt = files_fdtable(files);
	if (fd >= fdt->max_fds)
		goto out_unlock;
	file = fdt->fd[fd];
	if (!file)
		goto out_unlock;
	rcu_assign_pointer(fdt->fd[fd], NULL);
	__clear_close_on_exec(fd, fdt);
	__put_unused_fd(files, fd);
	spin_unlock(&files->file_lock);
	return filp_close(file, files);

out_unlock:
	spin_unlock(&files->file_lock);
	return -EBADF;
}

void do_close_on_exec(struct files_struct *files)
{
	unsigned i;
	struct fdtable *fdt;

	/* exec unshares first */
	spin_lock(&files->file_lock);
	for (i = 0; ; i++) {
		unsigned long set;
		unsigned fd = i * BITS_PER_LONG;
		fdt = files_fdtable(files);
		if (fd >= fdt->max_fds)
			break;
		set = fdt->close_on_exec[i];
		if (!set)
			continue;
		fdt->close_on_exec[i] = 0;
		for ( ; set ; fd++, set >>= 1) {
			struct file *file;
			if (!(set & 1))
				continue;
			file = fdt->fd[fd];
			if (!file)
				continue;
			rcu_assign_pointer(fdt->fd[fd], NULL);
			__put_unused_fd(files, fd);
			spin_unlock(&files->file_lock);
			filp_close(file, files);
			cond_resched();
			spin_lock(&files->file_lock);
		}

	}
	spin_unlock(&files->file_lock);
}

struct file *fget(unsigned int fd)
{
	struct file *file;
	struct files_struct *files = current->files;

	rcu_read_lock();
	file = fcheck_files(files, fd);
	if (file) {
		/* File object ref couldn't be taken */
		if (file->f_mode & FMODE_PATH ||
		    !atomic_long_inc_not_zero(&file->f_count))
			file = NULL;
	}
	rcu_read_unlock();

	return file;
}

EXPORT_SYMBOL(fget);

struct file *fget_raw(unsigned int fd)
{
	struct file *file;
	struct files_struct *files = current->files;

	rcu_read_lock();
	file = fcheck_files(files, fd);
	if (file) {
		/* File object ref couldn't be taken */
		if (!atomic_long_inc_not_zero(&file->f_count))
			file = NULL;
	}
	rcu_read_unlock();

	return file;
}

EXPORT_SYMBOL(fget_raw);

/*
 * Lightweight file lookup - no refcnt increment if fd table isn't shared.
 *
 * You can use this instead of fget if you satisfy all of the following
 * conditions:
 * 1) You must call fput_light before exiting the syscall and returning control
 *    to userspace (i.e. you cannot remember the returned struct file * after
 *    returning to userspace).
 * 2) You must not call filp_close on the returned struct file * in between
 *    calls to fget_light and fput_light.
 * 3) You must not clone the current task in between the calls to fget_light
 *    and fput_light.
 *
 * The fput_needed flag returned by fget_light should be passed to the
 * corresponding fput_light.
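 *
 * A minimal sketch of such a call sequence (illustrative only):
 *
 *	int fput_needed;
 *	struct file *file = fget_light(fd, &fput_needed);
 *
 *	if (!file)
 *		return -EBADF;
 *	(... use file here, within the same syscall ...)
 *	fput_light(file, fput_needed);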
 */
struct file *fget_light(unsigned int fd, int *fput_needed)
{
	struct file *file;
	struct files_struct *files = current->files;

	*fput_needed = 0;
	if (atomic_read(&files->count) == 1) {
		file = fcheck_files(files, fd);
		if (file && (file->f_mode & FMODE_PATH))
			file = NULL;
	} else {
		rcu_read_lock();
		file = fcheck_files(files, fd);
		if (file) {
			if (!(file->f_mode & FMODE_PATH) &&
			    atomic_long_inc_not_zero(&file->f_count))
				*fput_needed = 1;
			else
				/* Didn't get the reference, someone's freed */
				file = NULL;
		}
		rcu_read_unlock();
	}

	return file;
}
EXPORT_SYMBOL(fget_light);

struct file *fget_raw_light(unsigned int fd, int *fput_needed)
{
	struct file *file;
	struct files_struct *files = current->files;

	*fput_needed = 0;
	if (atomic_read(&files->count) == 1) {
		file = fcheck_files(files, fd);
	} else {
		rcu_read_lock();
		file = fcheck_files(files, fd);
		if (file) {
			if (atomic_long_inc_not_zero(&file->f_count))
				*fput_needed = 1;
			else
				/* Didn't get the reference, someone's freed */
				file = NULL;
		}
		rcu_read_unlock();
	}

	return file;
}

void set_close_on_exec(unsigned int fd, int flag)
{
	struct files_struct *files = current->files;
	struct fdtable *fdt;
	spin_lock(&files->file_lock);
	fdt = files_fdtable(files);
	if (flag)
		__set_close_on_exec(fd, fdt);
	else
		__clear_close_on_exec(fd, fdt);
	spin_unlock(&files->file_lock);
}

bool get_close_on_exec(unsigned int fd)
{
	struct files_struct *files = current->files;
	struct fdtable *fdt;
	bool res;
	rcu_read_lock();
	fdt = files_fdtable(files);
	res = close_on_exec(fd, fdt);
	rcu_read_unlock();
	return res;
}

static int do_dup2(struct files_struct *files,
	struct file *file, unsigned fd, unsigned flags)
{
	struct file *tofree;
	struct fdtable *fdt;

	/*
	 * We need to detect attempts to do dup2() over allocated but still
	 * not finished descriptor.  NB: OpenBSD avoids that at the price of
	 * extra work in their equivalent of fget() - they insert struct
	 * file immediately after grabbing descriptor, mark it larval if
	 * more work (e.g. actual opening) is needed and make sure that
	 * fget() treats larval files as absent.  Potentially interesting,
	 * but while extra work in fget() is trivial, locking implications
	 * and amount of surgery on open()-related paths in VFS are not.
	 * FreeBSD fails with -EBADF in the same situation, NetBSD "solution"
	 * deadlocks in rather amusing ways, AFAICS.  All of that is out of
	 * scope of POSIX or SUS, since neither considers shared descriptor
	 * tables and this condition does not arise without those.
	 */
	fdt = files_fdtable(files);
	tofree = fdt->fd[fd];
	if (!tofree && fd_is_open(fd, fdt))
		goto Ebusy;
	get_file(file);
	rcu_assign_pointer(fdt->fd[fd], file);
	__set_open_fd(fd, fdt);
	if (flags & O_CLOEXEC)
		__set_close_on_exec(fd, fdt);
	else
		__clear_close_on_exec(fd, fdt);
	spin_unlock(&files->file_lock);

	if (tofree)
		filp_close(tofree, files);

	return fd;

Ebusy:
	spin_unlock(&files->file_lock);
	return -EBUSY;
}

int replace_fd(unsigned fd, struct file *file, unsigned flags)
{
	int err;
	struct files_struct *files = current->files;

	if (!file)
		return __close_fd(files, fd);

	if (fd >= rlimit(RLIMIT_NOFILE))
		return -EBADF;

	spin_lock(&files->file_lock);
	err = expand_files(files, fd);
	if (unlikely(err < 0))
		goto out_unlock;
	return do_dup2(files, file, fd, flags);

out_unlock:
	spin_unlock(&files->file_lock);
	return err;
}

SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
{
	int err = -EBADF;
	struct file *file;
	struct files_struct *files = current->files;

	if ((flags & ~O_CLOEXEC) != 0)
		return -EINVAL;

	if (unlikely(oldfd == newfd))
		return -EINVAL;

	if (newfd >= rlimit(RLIMIT_NOFILE))
		return -EBADF;

	spin_lock(&files->file_lock);
	err = expand_files(files, newfd);
	file = fcheck(oldfd);
	if (unlikely(!file))
		goto Ebadf;
	if (unlikely(err < 0)) {
		if (err == -EMFILE)
			goto Ebadf;
		goto out_unlock;
	}
	return do_dup2(files, file, newfd, flags);

Ebadf:
	err = -EBADF;
out_unlock:
	spin_unlock(&files->file_lock);
	return err;
}

SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
{
	if (unlikely(newfd == oldfd)) { /* corner case */
		struct files_struct *files = current->files;
		int retval = oldfd;

		rcu_read_lock();
		if (!fcheck_files(files, oldfd))
			retval = -EBADF;
		rcu_read_unlock();
		return retval;
	}
	return sys_dup3(oldfd, newfd, 0);
}

SYSCALL_DEFINE1(dup, unsigned int, fildes)
{
	int ret = -EBADF;
	struct file *file = fget_raw(fildes);

	if (file) {
		ret = get_unused_fd();
		if (ret >= 0)
			fd_install(ret, file);
		else
			fput(file);
	}
	return ret;
}

int f_dupfd(unsigned int from, struct file *file, unsigned flags)
{
	int err;
	if (from >= rlimit(RLIMIT_NOFILE))
		return -EINVAL;
	err = alloc_fd(from, flags);
	if (err >= 0) {
		get_file(file);
		fd_install(err, file);
	}
	return err;
}

int iterate_fd(struct files_struct *files, unsigned n,
		int (*f)(const void *, struct file *, unsigned),
		const void *p)
{
	struct fdtable *fdt;
	int res = 0;
	if (!files)
		return 0;
	spin_lock(&files->file_lock);
	for (fdt = files_fdtable(files); n < fdt->max_fds; n++) {
		struct file *file;
		file = rcu_dereference_check_fdtable(files, fdt->fd[n]);
		if (!file)
			continue;
		res = f(p, file, n);
		if (res)
			break;
	}
	spin_unlock(&files->file_lock);
	return res;
}
EXPORT_SYMBOL(iterate_fd);
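
/*
 * A sketch of iterate_fd() usage (illustrative only - the callback below is
 * hypothetical and not part of this file).  A nonzero return from the
 * callback stops the walk and is passed back to the caller:
 *
 *	static int count_one(const void *p, struct file *file, unsigned fd)
 *	{
 *		(*(unsigned *)p)++;
 *		return 0;
 *	}
 *
 *	unsigned count = 0;
 *	iterate_fd(current->files, 0, count_one, &count);
 */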