xref: /linux/fs/file.c (revision fdb53791195ce8fe99ba5e538689f5b3e5968ced)
1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   *  linux/fs/file.c
4   *
5   *  Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
6   *
7   *  Manage the dynamic fd arrays in the process files_struct.
8   */
9  
10  #include <linux/syscalls.h>
11  #include <linux/export.h>
12  #include <linux/fs.h>
13  #include <linux/kernel.h>
14  #include <linux/mm.h>
15  #include <linux/sched/signal.h>
16  #include <linux/slab.h>
17  #include <linux/file.h>
18  #include <linux/fdtable.h>
19  #include <linux/bitops.h>
20  #include <linux/spinlock.h>
21  #include <linux/rcupdate.h>
22  #include <linux/close_range.h>
23  #include <net/sock.h>
24  
25  #include "internal.h"
26  
27  unsigned int sysctl_nr_open __read_mostly = 1024*1024;
28  unsigned int sysctl_nr_open_min = BITS_PER_LONG;
29  /* our min() is unusable in constant expressions ;-/ */
30  #define __const_min(x, y) ((x) < (y) ? (x) : (y))
31  unsigned int sysctl_nr_open_max =
32  	__const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG;
33  
34  static void __free_fdtable(struct fdtable *fdt)
35  {
36  	kvfree(fdt->fd);
37  	kvfree(fdt->open_fds);
38  	kfree(fdt);
39  }
40  
41  static void free_fdtable_rcu(struct rcu_head *rcu)
42  {
43  	__free_fdtable(container_of(rcu, struct fdtable, rcu));
44  }
45  
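/*
 * BITBIT_NR(nr) is the number of longs needed for a bitmap that has one bit
 * per BITS_PER_LONG-sized word of an nr-bit bitmap - i.e. the size of
 * ->full_fds_bits for an nr-descriptor table.  BITBIT_SIZE(nr) is the same
 * quantity in bytes.
 */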
46  #define BITBIT_NR(nr)	BITS_TO_LONGS(BITS_TO_LONGS(nr))
47  #define BITBIT_SIZE(nr)	(BITBIT_NR(nr) * sizeof(long))
48  
49  #define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG) // words in ->open_fds
50  /*
51   * Copy 'copy_words' words of the fd bitmaps from the old table to the new
52   * table and clear the extra space if any.  This does not copy the file
53   * pointers.  Called with the files spinlock held for write.
54   */
55  static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
56  			    unsigned int copy_words)
57  {
58  	unsigned int nwords = fdt_words(nfdt);
59  
60  	bitmap_copy_and_extend(nfdt->open_fds, ofdt->open_fds,
61  			copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG);
62  	bitmap_copy_and_extend(nfdt->close_on_exec, ofdt->close_on_exec,
63  			copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG);
64  	bitmap_copy_and_extend(nfdt->full_fds_bits, ofdt->full_fds_bits,
65  			copy_words, nwords);
66  }
67  
68  /*
69   * Copy all file descriptors from the old table to the new, expanded table and
70   * clear the extra space.  Called with the files spinlock held for write.
71   */
72  static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
73  {
74  	size_t cpy, set;
75  
76  	BUG_ON(nfdt->max_fds < ofdt->max_fds);
77  
78  	cpy = ofdt->max_fds * sizeof(struct file *);
79  	set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
80  	memcpy(nfdt->fd, ofdt->fd, cpy);
81  	memset((char *)nfdt->fd + cpy, 0, set);
82  
83  	copy_fd_bitmaps(nfdt, ofdt, fdt_words(ofdt));
84  }
85  
86  /*
87   * Note how the fdtable bitmap allocations very much have to be a multiple of
88   * BITS_PER_LONG. This is not only because we walk those things in chunks of
89   * 'unsigned long' in some places, but simply because that is how the Linux
90   * kernel bitmaps are defined to work: they are not "bits in an array of bytes",
91   * they are very much "bits in an array of unsigned long".
92   *
93   * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied
94   * by that "1024/sizeof(ptr)" before, we already know there are sufficient
95   * clear low bits. Clang seems to realize that; gcc ends up being confused.
96   *
97   * On a 128-bit machine, the ALIGN() would actually matter. In the meantime,
98   * let's consider it documentation (and maybe a test-case for gcc to improve
99   * its code generation ;)
100   */
101  static struct fdtable * alloc_fdtable(unsigned int nr)
102  {
103  	struct fdtable *fdt;
104  	void *data;
105  
106  	/*
107  	 * Figure out how many fds we actually want to support in this fdtable.
108  	 * Allocation steps are keyed to the size of the fdarray, since it
109  	 * grows far faster than any of the other dynamic data. We try to fit
110  	 * the fdarray into comfortable page-tuned chunks: starting at 1024B
111  	 * and growing in powers of two from there on.
112  	 */
113  	nr /= (1024 / sizeof(struct file *));
114  	nr = roundup_pow_of_two(nr + 1);
115  	nr *= (1024 / sizeof(struct file *));
116  	nr = ALIGN(nr, BITS_PER_LONG);
117  	/*
118  	 * Note that this can drive nr *below* what we had passed if sysctl_nr_open
119  	 * had been set lower between the check in expand_files() and here.  Deal
120  	 * with that in caller, it's cheaper that way.
121  	 *
122  	 * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
123  	 * bitmaps handling below becomes unpleasant, to put it mildly...
124  	 */
125  	if (unlikely(nr > sysctl_nr_open))
126  		nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;
127  
128  	fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
129  	if (!fdt)
130  		goto out;
131  	fdt->max_fds = nr;
132  	data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT);
133  	if (!data)
134  		goto out_fdt;
135  	fdt->fd = data;
136  
137  	data = kvmalloc(max_t(size_t,
138  				 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES),
139  				 GFP_KERNEL_ACCOUNT);
140  	if (!data)
141  		goto out_arr;
142  	fdt->open_fds = data;
143  	data += nr / BITS_PER_BYTE;
144  	fdt->close_on_exec = data;
145  	data += nr / BITS_PER_BYTE;
146  	fdt->full_fds_bits = data;
147  
148  	return fdt;
149  
150  out_arr:
151  	kvfree(fdt->fd);
152  out_fdt:
153  	kfree(fdt);
154  out:
155  	return NULL;
156  }
157  
158  /*
159   * Expand the file descriptor table.
160   * This function will allocate a new fdtable and both fd array and fdset, of
161   * the given size.
162   * Return <0 error code on error; 1 on successful completion.
163   * The files->file_lock should be held on entry, and will be held on exit.
164   */
165  static int expand_fdtable(struct files_struct *files, unsigned int nr)
166  	__releases(files->file_lock)
167  	__acquires(files->file_lock)
168  {
169  	struct fdtable *new_fdt, *cur_fdt;
170  
171  	spin_unlock(&files->file_lock);
172  	new_fdt = alloc_fdtable(nr);
173  
174  	/* make sure all fd_install() have seen resize_in_progress
175  	 * or have finished their rcu_read_lock_sched() section.
176  	 */
177  	if (atomic_read(&files->count) > 1)
178  		synchronize_rcu();
179  
180  	spin_lock(&files->file_lock);
181  	if (!new_fdt)
182  		return -ENOMEM;
183  	/*
184  	 * extremely unlikely race - sysctl_nr_open decreased between the check in
185  	 * caller and alloc_fdtable().  Cheaper to catch it here...
186  	 */
187  	if (unlikely(new_fdt->max_fds <= nr)) {
188  		__free_fdtable(new_fdt);
189  		return -EMFILE;
190  	}
191  	cur_fdt = files_fdtable(files);
192  	BUG_ON(nr < cur_fdt->max_fds);
193  	copy_fdtable(new_fdt, cur_fdt);
194  	rcu_assign_pointer(files->fdt, new_fdt);
195  	if (cur_fdt != &files->fdtab)
196  		call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
197  	/* coupled with smp_rmb() in fd_install() */
198  	smp_wmb();
199  	return 1;
200  }
201  
202  /*
203   * Expand files.
204   * This function will expand the file structures, if the requested size exceeds
205   * the current capacity and there is room for expansion.
206   * Return <0 error code on error; 0 when nothing done; 1 when files were
207   * expanded and execution may have blocked.
208   * The files->file_lock should be held on entry, and will be held on exit.
209   */
210  static int expand_files(struct files_struct *files, unsigned int nr)
211  	__releases(files->file_lock)
212  	__acquires(files->file_lock)
213  {
214  	struct fdtable *fdt;
215  	int expanded = 0;
216  
217  repeat:
218  	fdt = files_fdtable(files);
219  
220  	/* Do we need to expand? */
221  	if (nr < fdt->max_fds)
222  		return expanded;
223  
224  	/* Can we expand? */
225  	if (nr >= sysctl_nr_open)
226  		return -EMFILE;
227  
228  	if (unlikely(files->resize_in_progress)) {
229  		spin_unlock(&files->file_lock);
230  		expanded = 1;
231  		wait_event(files->resize_wait, !files->resize_in_progress);
232  		spin_lock(&files->file_lock);
233  		goto repeat;
234  	}
235  
236  	/* All good, so we try */
237  	files->resize_in_progress = true;
238  	expanded = expand_fdtable(files, nr);
239  	files->resize_in_progress = false;
240  
241  	wake_up_all(&files->resize_wait);
242  	return expanded;
243  }
244  
245  static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt)
246  {
247  	__set_bit(fd, fdt->close_on_exec);
248  }
249  
250  static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt)
251  {
252  	if (test_bit(fd, fdt->close_on_exec))
253  		__clear_bit(fd, fdt->close_on_exec);
254  }
255  
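/*
 * The open_fds helpers below also maintain ->full_fds_bits: a word's summary
 * bit is set only while every descriptor in that word of ->open_fds is in
 * use, which lets find_next_fd() skip full words without scanning them.
 */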
256  static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt)
257  {
258  	__set_bit(fd, fdt->open_fds);
259  	fd /= BITS_PER_LONG;
260  	if (!~fdt->open_fds[fd])
261  		__set_bit(fd, fdt->full_fds_bits);
262  }
263  
264  static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
265  {
266  	__clear_bit(fd, fdt->open_fds);
267  	__clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits);
268  }
269  
270  static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
271  {
272  	return test_bit(fd, fdt->open_fds);
273  }
274  
275  /*
276   * Note that a sane fdtable size always has to be a multiple of
277   * BITS_PER_LONG, since we have bitmaps that are sized by this.
278   *
279   * punch_hole is optional - when close_range() is asked to unshare
280   * and close, we don't need to copy descriptors in that range, so
281   * a smaller cloned descriptor table might suffice if the last
282   * currently opened descriptor falls into that range.
283   */
284  static unsigned int sane_fdtable_size(struct fdtable *fdt, struct fd_range *punch_hole)
285  {
286  	unsigned int last = find_last_bit(fdt->open_fds, fdt->max_fds);
287  
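	/* find_last_bit() returns the bitmap size when no bit is set, i.e. no fd is open */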
288  	if (last == fdt->max_fds)
289  		return NR_OPEN_DEFAULT;
290  	if (punch_hole && punch_hole->to >= last && punch_hole->from <= last) {
291  		last = find_last_bit(fdt->open_fds, punch_hole->from);
292  		if (last == punch_hole->from)
293  			return NR_OPEN_DEFAULT;
294  	}
295  	return ALIGN(last + 1, BITS_PER_LONG);
296  }
297  
298  /*
299   * Allocate a new descriptor table and copy contents from the passed in
300   * instance.  Returns a pointer to cloned table on success, ERR_PTR()
301   * on failure.  For 'punch_hole' see sane_fdtable_size().
302   */
303  struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_hole)
304  {
305  	struct files_struct *newf;
306  	struct file **old_fds, **new_fds;
307  	unsigned int open_files, i;
308  	struct fdtable *old_fdt, *new_fdt;
309  	int error;
310  
311  	newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
312  	if (!newf)
313  		return ERR_PTR(-ENOMEM);
314  
315  	atomic_set(&newf->count, 1);
316  
317  	spin_lock_init(&newf->file_lock);
318  	newf->resize_in_progress = false;
319  	init_waitqueue_head(&newf->resize_wait);
320  	newf->next_fd = 0;
321  	new_fdt = &newf->fdtab;
322  	new_fdt->max_fds = NR_OPEN_DEFAULT;
323  	new_fdt->close_on_exec = newf->close_on_exec_init;
324  	new_fdt->open_fds = newf->open_fds_init;
325  	new_fdt->full_fds_bits = newf->full_fds_bits_init;
326  	new_fdt->fd = &newf->fd_array[0];
327  
328  	spin_lock(&oldf->file_lock);
329  	old_fdt = files_fdtable(oldf);
330  	open_files = sane_fdtable_size(old_fdt, punch_hole);
331  
332  	/*
333  	 * Check whether we need to allocate a larger fd array and fd set.
334  	 */
335  	while (unlikely(open_files > new_fdt->max_fds)) {
336  		spin_unlock(&oldf->file_lock);
337  
338  		if (new_fdt != &newf->fdtab)
339  			__free_fdtable(new_fdt);
340  
341  		new_fdt = alloc_fdtable(open_files - 1);
342  		if (!new_fdt) {
343  			error = -ENOMEM;
344  			goto out_release;
345  		}
346  
347  		/* beyond sysctl_nr_open; nothing to do */
348  		if (unlikely(new_fdt->max_fds < open_files)) {
349  			__free_fdtable(new_fdt);
350  			error = -EMFILE;
351  			goto out_release;
352  		}
353  
354  		/*
355  		 * Reacquire the oldf lock and a pointer to its fd table;
356  		 * it may have grown a new, bigger fd table in the meantime,
357  		 * so we need the latest pointer.
358  		 */
359  		spin_lock(&oldf->file_lock);
360  		old_fdt = files_fdtable(oldf);
361  		open_files = sane_fdtable_size(old_fdt, punch_hole);
362  	}
363  
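	/* open_files is a multiple of BITS_PER_LONG (see sane_fdtable_size()) */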
364  	copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG);
365  
366  	old_fds = old_fdt->fd;
367  	new_fds = new_fdt->fd;
368  
369  	for (i = open_files; i != 0; i--) {
370  		struct file *f = *old_fds++;
371  		if (f) {
372  			get_file(f);
373  		} else {
374  			/*
375  			 * The fd may be claimed in the fd bitmap but not yet
376  			 * instantiated in the files array if a sibling thread
377  			 * is partway through open().  So make sure that this
378  			 * fd is available to the new process.
379  			 */
380  			__clear_open_fd(open_files - i, new_fdt);
381  		}
382  		rcu_assign_pointer(*new_fds++, f);
383  	}
384  	spin_unlock(&oldf->file_lock);
385  
386  	/* clear the remainder */
387  	memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *));
388  
389  	rcu_assign_pointer(newf->fdt, new_fdt);
390  
391  	return newf;
392  
393  out_release:
394  	kmem_cache_free(files_cachep, newf);
395  	return ERR_PTR(error);
396  }
397  
398  static struct fdtable *close_files(struct files_struct * files)
399  {
400  	/*
401  	 * It is safe to dereference the fd table without RCU or
402  	 * ->file_lock because this is the last reference to the
403  	 * files structure.
404  	 */
405  	struct fdtable *fdt = rcu_dereference_raw(files->fdt);
406  	unsigned int i, j = 0;
407  
408  	for (;;) {
409  		unsigned long set;
410  		i = j * BITS_PER_LONG;
411  		if (i >= fdt->max_fds)
412  			break;
413  		set = fdt->open_fds[j++];
414  		while (set) {
415  			if (set & 1) {
416  				struct file * file = xchg(&fdt->fd[i], NULL);
417  				if (file) {
418  					filp_close(file, files);
419  					cond_resched();
420  				}
421  			}
422  			i++;
423  			set >>= 1;
424  		}
425  	}
426  
427  	return fdt;
428  }
429  
430  void put_files_struct(struct files_struct *files)
431  {
432  	if (atomic_dec_and_test(&files->count)) {
433  		struct fdtable *fdt = close_files(files);
434  
435  		/* free the arrays if they are not embedded */
436  		if (fdt != &files->fdtab)
437  			__free_fdtable(fdt);
438  		kmem_cache_free(files_cachep, files);
439  	}
440  }
441  
442  void exit_files(struct task_struct *tsk)
443  {
444  	struct files_struct * files = tsk->files;
445  
446  	if (files) {
447  		task_lock(tsk);
448  		tsk->files = NULL;
449  		task_unlock(tsk);
450  		put_files_struct(files);
451  	}
452  }
453  
454  struct files_struct init_files = {
455  	.count		= ATOMIC_INIT(1),
456  	.fdt		= &init_files.fdtab,
457  	.fdtab		= {
458  		.max_fds	= NR_OPEN_DEFAULT,
459  		.fd		= &init_files.fd_array[0],
460  		.close_on_exec	= init_files.close_on_exec_init,
461  		.open_fds	= init_files.open_fds_init,
462  		.full_fds_bits	= init_files.full_fds_bits_init,
463  	},
464  	.file_lock	= __SPIN_LOCK_UNLOCKED(init_files.file_lock),
465  	.resize_wait	= __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait),
466  };
467  
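/*
 * Find the lowest unused descriptor at or above 'start'.  ->full_fds_bits is
 * consulted first to skip words of ->open_fds that are known to be completely
 * full; only then is ->open_fds itself scanned.  Returns max_fds when no free
 * slot exists at or above 'start'.
 */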
468  static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
469  {
470  	unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */
471  	unsigned int maxbit = maxfd / BITS_PER_LONG;
472  	unsigned int bitbit = start / BITS_PER_LONG;
473  
474  	bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
475  	if (bitbit >= maxfd)
476  		return maxfd;
477  	if (bitbit > start)
478  		start = bitbit;
479  	return find_next_zero_bit(fdt->open_fds, maxfd, start);
480  }
481  
482  /*
483   * allocate a file descriptor, mark it busy.
484   */
485  static int alloc_fd(unsigned start, unsigned end, unsigned flags)
486  {
487  	struct files_struct *files = current->files;
488  	unsigned int fd;
489  	int error;
490  	struct fdtable *fdt;
491  
492  	spin_lock(&files->file_lock);
493  repeat:
494  	fdt = files_fdtable(files);
495  	fd = start;
496  	if (fd < files->next_fd)
497  		fd = files->next_fd;
498  
499  	if (fd < fdt->max_fds)
500  		fd = find_next_fd(fdt, fd);
501  
502  	/*
503  	 * N.B. For clone tasks sharing a files structure, this test
504  	 * will limit the total number of files that can be opened.
505  	 */
506  	error = -EMFILE;
507  	if (fd >= end)
508  		goto out;
509  
510  	error = expand_files(files, fd);
511  	if (error < 0)
512  		goto out;
513  
514  	/*
515  	 * If we needed to expand the fd array we
516  	 * might have blocked - try again.
517  	 */
518  	if (error)
519  		goto repeat;
520  
521  	if (start <= files->next_fd)
522  		files->next_fd = fd + 1;
523  
524  	__set_open_fd(fd, fdt);
525  	if (flags & O_CLOEXEC)
526  		__set_close_on_exec(fd, fdt);
527  	else
528  		__clear_close_on_exec(fd, fdt);
529  	error = fd;
530  #if 1
531  	/* Sanity check */
532  	if (rcu_access_pointer(fdt->fd[fd]) != NULL) {
533  		printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
534  		rcu_assign_pointer(fdt->fd[fd], NULL);
535  	}
536  #endif
537  
538  out:
539  	spin_unlock(&files->file_lock);
540  	return error;
541  }
542  
543  int __get_unused_fd_flags(unsigned flags, unsigned long nofile)
544  {
545  	return alloc_fd(0, nofile, flags);
546  }
547  
548  int get_unused_fd_flags(unsigned flags)
549  {
550  	return __get_unused_fd_flags(flags, rlimit(RLIMIT_NOFILE));
551  }
552  EXPORT_SYMBOL(get_unused_fd_flags);
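/*
 * Typical usage, as a minimal sketch: reserve the descriptor first, then
 * either publish a file in it with fd_install() or hand the slot back with
 * put_unused_fd() on failure ("make_file()" below is a stand-in for whatever
 * constructs the struct file):
 *
 *	fd = get_unused_fd_flags(O_CLOEXEC);
 *	if (fd < 0)
 *		return fd;
 *	file = make_file();
 *	if (IS_ERR(file)) {
 *		put_unused_fd(fd);
 *		return PTR_ERR(file);
 *	}
 *	fd_install(fd, file);
 *	return fd;
 */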
553  
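/*
 * Hand a descriptor back to the table.  ->next_fd tracks the lowest
 * descriptor that might be free, so alloc_fd() can start its search there.
 */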
554  static void __put_unused_fd(struct files_struct *files, unsigned int fd)
555  {
556  	struct fdtable *fdt = files_fdtable(files);
557  	__clear_open_fd(fd, fdt);
558  	if (fd < files->next_fd)
559  		files->next_fd = fd;
560  }
561  
562  void put_unused_fd(unsigned int fd)
563  {
564  	struct files_struct *files = current->files;
565  	spin_lock(&files->file_lock);
566  	__put_unused_fd(files, fd);
567  	spin_unlock(&files->file_lock);
568  }
569  
570  EXPORT_SYMBOL(put_unused_fd);
571  
572  /*
573   * Install a file pointer in the fd array.
574   *
575   * The VFS is full of places where we drop the files lock between
576   * setting the open_fds bitmap and installing the file in the file
577   * array.  At any such point, we are vulnerable to a dup2() race
578   * installing a file in the array before us.  We need to detect this and
579   * fput() the struct file we are about to overwrite in this case.
580   *
581   * It should never happen - if we allow dup2() do it, _really_ bad things
582   * will follow.
583   *
584   * This consumes the "file" refcount, so callers should treat it
585   * as if they had called fput(file).
586   */
587  
588  void fd_install(unsigned int fd, struct file *file)
589  {
590  	struct files_struct *files = current->files;
591  	struct fdtable *fdt;
592  
593  	if (WARN_ON_ONCE(unlikely(file->f_mode & FMODE_BACKING)))
594  		return;
595  
596  	rcu_read_lock_sched();
597  
598  	if (unlikely(files->resize_in_progress)) {
599  		rcu_read_unlock_sched();
600  		spin_lock(&files->file_lock);
601  		fdt = files_fdtable(files);
602  		BUG_ON(fdt->fd[fd] != NULL);
603  		rcu_assign_pointer(fdt->fd[fd], file);
604  		spin_unlock(&files->file_lock);
605  		return;
606  	}
607  	/* coupled with smp_wmb() in expand_fdtable() */
608  	smp_rmb();
609  	fdt = rcu_dereference_sched(files->fdt);
610  	BUG_ON(fdt->fd[fd] != NULL);
611  	rcu_assign_pointer(fdt->fd[fd], file);
612  	rcu_read_unlock_sched();
613  }
614  
615  EXPORT_SYMBOL(fd_install);
616  
617  /**
618   * file_close_fd_locked - return file associated with fd
619   * @files: file struct to retrieve file from
620   * @fd: file descriptor to retrieve file for
621   *
622   * Doesn't take a separate reference count.
623   *
624   * Context: files_lock must be held.
625   *
626   * Returns: The file associated with @fd (NULL if @fd is not open)
627   */
628  struct file *file_close_fd_locked(struct files_struct *files, unsigned fd)
629  {
630  	struct fdtable *fdt = files_fdtable(files);
631  	struct file *file;
632  
633  	lockdep_assert_held(&files->file_lock);
634  
635  	if (fd >= fdt->max_fds)
636  		return NULL;
637  
638  	fd = array_index_nospec(fd, fdt->max_fds);
639  	file = fdt->fd[fd];
640  	if (file) {
641  		rcu_assign_pointer(fdt->fd[fd], NULL);
642  		__put_unused_fd(files, fd);
643  	}
644  	return file;
645  }
646  
647  int close_fd(unsigned fd)
648  {
649  	struct files_struct *files = current->files;
650  	struct file *file;
651  
652  	spin_lock(&files->file_lock);
653  	file = file_close_fd_locked(files, fd);
654  	spin_unlock(&files->file_lock);
655  	if (!file)
656  		return -EBADF;
657  
658  	return filp_close(file, files);
659  }
660  EXPORT_SYMBOL(close_fd);
661  
662  /**
663   * last_fd - return last valid index into fd table
664   * @fdt: File descriptor table.
665   *
666   * Context: Either rcu read lock or files_lock must be held.
667   *
668   * Returns: Last valid index into fdtable.
669   */
670  static inline unsigned last_fd(struct fdtable *fdt)
671  {
672  	return fdt->max_fds - 1;
673  }
674  
675  static inline void __range_cloexec(struct files_struct *cur_fds,
676  				   unsigned int fd, unsigned int max_fd)
677  {
678  	struct fdtable *fdt;
679  
680  	/* make sure we're using the correct maximum value */
681  	spin_lock(&cur_fds->file_lock);
682  	fdt = files_fdtable(cur_fds);
683  	max_fd = min(last_fd(fdt), max_fd);
684  	if (fd <= max_fd)
685  		bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1);
686  	spin_unlock(&cur_fds->file_lock);
687  }
688  
689  static inline void __range_close(struct files_struct *files, unsigned int fd,
690  				 unsigned int max_fd)
691  {
692  	struct file *file;
693  	unsigned n;
694  
695  	spin_lock(&files->file_lock);
696  	n = last_fd(files_fdtable(files));
697  	max_fd = min(max_fd, n);
698  
699  	for (; fd <= max_fd; fd++) {
700  		file = file_close_fd_locked(files, fd);
701  		if (file) {
702  			spin_unlock(&files->file_lock);
703  			filp_close(file, files);
704  			cond_resched();
705  			spin_lock(&files->file_lock);
706  		} else if (need_resched()) {
707  			spin_unlock(&files->file_lock);
708  			cond_resched();
709  			spin_lock(&files->file_lock);
710  		}
711  	}
712  	spin_unlock(&files->file_lock);
713  }
714  
715  /**
716   * __close_range() - Close all file descriptors in a given range.
717   *
718   * @fd:     starting file descriptor to close
719   * @max_fd: last file descriptor to close
720   * @flags:  CLOSE_RANGE flags.
721   *
722   * This closes a range of file descriptors. All file descriptors
723   * from @fd up to and including @max_fd are closed.
724   */
725  int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
726  {
727  	struct task_struct *me = current;
728  	struct files_struct *cur_fds = me->files, *fds = NULL;
729  
730  	if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC))
731  		return -EINVAL;
732  
733  	if (fd > max_fd)
734  		return -EINVAL;
735  
736  	if ((flags & CLOSE_RANGE_UNSHARE) && atomic_read(&cur_fds->count) > 1) {
737  		struct fd_range range = {fd, max_fd}, *punch_hole = &range;
738  
739  		/*
740  		 * If the caller requested all fds to be made cloexec we always
741  		 * copy all of the file descriptors since they still want to
742  		 * use them.
743  		 */
744  		if (flags & CLOSE_RANGE_CLOEXEC)
745  			punch_hole = NULL;
746  
747  		fds = dup_fd(cur_fds, punch_hole);
748  		if (IS_ERR(fds))
749  			return PTR_ERR(fds);
750  		/*
751  		 * We used to share our file descriptor table, and have now
752  		 * created a private one; make sure we're using it below.
753  		 */
754  		swap(cur_fds, fds);
755  	}
756  
757  	if (flags & CLOSE_RANGE_CLOEXEC)
758  		__range_cloexec(cur_fds, fd, max_fd);
759  	else
760  		__range_close(cur_fds, fd, max_fd);
761  
762  	if (fds) {
763  		/*
764  		 * We're done closing the files we were supposed to. Time to install
765  		 * the new file descriptor table and drop the old one.
766  		 */
767  		task_lock(me);
768  		me->files = cur_fds;
769  		task_unlock(me);
770  		put_files_struct(fds);
771  	}
772  
773  	return 0;
774  }
775  
776  /**
777   * file_close_fd - return file associated with fd
778   * @fd: file descriptor to retrieve file for
779   *
780   * Doesn't take a separate reference count.
781   *
782   * Returns: The file associated with @fd (NULL if @fd is not open)
783   */
784  struct file *file_close_fd(unsigned int fd)
785  {
786  	struct files_struct *files = current->files;
787  	struct file *file;
788  
789  	spin_lock(&files->file_lock);
790  	file = file_close_fd_locked(files, fd);
791  	spin_unlock(&files->file_lock);
792  
793  	return file;
794  }
795  
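/*
 * Close every descriptor marked close-on-exec.  Each close_on_exec word is
 * cleared before it is walked, and ->file_lock is dropped around each
 * filp_close() call.
 */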
796  void do_close_on_exec(struct files_struct *files)
797  {
798  	unsigned i;
799  	struct fdtable *fdt;
800  
801  	/* exec unshares first */
802  	spin_lock(&files->file_lock);
803  	for (i = 0; ; i++) {
804  		unsigned long set;
805  		unsigned fd = i * BITS_PER_LONG;
806  		fdt = files_fdtable(files);
807  		if (fd >= fdt->max_fds)
808  			break;
809  		set = fdt->close_on_exec[i];
810  		if (!set)
811  			continue;
812  		fdt->close_on_exec[i] = 0;
813  		for ( ; set ; fd++, set >>= 1) {
814  			struct file *file;
815  			if (!(set & 1))
816  				continue;
817  			file = fdt->fd[fd];
818  			if (!file)
819  				continue;
820  			rcu_assign_pointer(fdt->fd[fd], NULL);
821  			__put_unused_fd(files, fd);
822  			spin_unlock(&files->file_lock);
823  			filp_close(file, files);
824  			cond_resched();
825  			spin_lock(&files->file_lock);
826  		}
827  
828  	}
829  	spin_unlock(&files->file_lock);
830  }
831  
832  static struct file *__get_file_rcu(struct file __rcu **f)
833  {
834  	struct file __rcu *file;
835  	struct file __rcu *file_reloaded;
836  	struct file __rcu *file_reloaded_cmp;
837  
838  	file = rcu_dereference_raw(*f);
839  	if (!file)
840  		return NULL;
841  
842  	if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
843  		return ERR_PTR(-EAGAIN);
844  
845  	file_reloaded = rcu_dereference_raw(*f);
846  
847  	/*
848  	 * Ensure that all accesses have a dependency on the load from
849  	 * rcu_dereference_raw() above so we get correct ordering
850  	 * between reuse/allocation and the pointer check below.
851  	 */
852  	file_reloaded_cmp = file_reloaded;
853  	OPTIMIZER_HIDE_VAR(file_reloaded_cmp);
854  
855  	/*
856  	 * atomic_long_inc_not_zero() above provided a full memory
857  	 * barrier when we acquired a reference.
858  	 *
859  	 * This is paired with the write barrier from assigning to the
860  	 * __rcu protected file pointer so that if that pointer still
861  	 * matches the current file, we know we have successfully
862  	 * acquired a reference to the right file.
863  	 *
864  	 * If the pointers don't match the file has been reallocated by
865  	 * SLAB_TYPESAFE_BY_RCU.
866  	 */
867  	if (file == file_reloaded_cmp)
868  		return file_reloaded;
869  
870  	fput(file);
871  	return ERR_PTR(-EAGAIN);
872  }
873  
874  /**
875   * get_file_rcu - try to get a reference to a file under rcu
876   * @f: the file to get a reference on
877   *
878   * This function tries to get a reference on @f carefully verifying that
879   * @f hasn't been reused.
880   *
881   * This function should rarely have to be used and only by users who
882   * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
883   *
884   * Return: Returns @f with the reference count increased or NULL.
885   */
886  struct file *get_file_rcu(struct file __rcu **f)
887  {
888  	for (;;) {
889  		struct file __rcu *file;
890  
891  		file = __get_file_rcu(f);
892  		if (!IS_ERR(file))
893  			return file;
894  	}
895  }
896  EXPORT_SYMBOL_GPL(get_file_rcu);
897  
898  /**
899   * get_file_active - try to get a reference to a file
900   * @f: the file to get a reference on
901   *
902   * In contrast to get_file_rcu() the pointer itself isn't part of the
903   * reference counting.
904   *
905   * This function should rarely have to be used and only by users who
906   * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
907   *
908   * Return: Returns @f with the reference count increased or NULL.
909   */
910  struct file *get_file_active(struct file **f)
911  {
912  	struct file __rcu *file;
913  
914  	rcu_read_lock();
915  	file = __get_file_rcu(f);
916  	rcu_read_unlock();
917  	if (IS_ERR(file))
918  		file = NULL;
919  	return file;
920  }
921  EXPORT_SYMBOL_GPL(get_file_active);
922  
923  static inline struct file *__fget_files_rcu(struct files_struct *files,
924         unsigned int fd, fmode_t mask)
925  {
926  	for (;;) {
927  		struct file *file;
928  		struct fdtable *fdt = rcu_dereference_raw(files->fdt);
929  		struct file __rcu **fdentry;
930  		unsigned long nospec_mask;
931  
932  		/* Mask is a 0 for invalid fd's, ~0 for valid ones */
933  		nospec_mask = array_index_mask_nospec(fd, fdt->max_fds);
934  
935  		/*
936  		 * fdentry points to the 'fd' offset, or fdt->fd[0].
937  		 * Loading from fdt->fd[0] is always safe, because the
938  		 * array always exists.
939  		 */
940  		fdentry = fdt->fd + (fd & nospec_mask);
941  
942  		/* Do the load, then mask any invalid result */
943  		file = rcu_dereference_raw(*fdentry);
944  		file = (void *)(nospec_mask & (unsigned long)file);
945  		if (unlikely(!file))
946  			return NULL;
947  
948  		/*
949  		 * Ok, we have a file pointer that was valid at
950  		 * some point, but it might have become stale since.
951  		 *
952  		 * We need to confirm it by incrementing the refcount
953  		 * and then check the lookup again.
954  		 *
955  		 * atomic_long_inc_not_zero() gives us a full memory
956  		 * barrier. We only really need an 'acquire' one to
957  		 * protect the loads below, but we don't have that.
958  		 */
959  		if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
960  			continue;
961  
962  		/*
963  		 * Such a race can take two forms:
964  		 *
965  		 *  (a) the file ref already went down to zero and the
966  		 *      file hasn't been reused yet or the file count
967  		 *      isn't zero but the file has already been reused.
968  		 *
969  		 *  (b) the file table entry has changed under us.
970  		 *       Note that we don't need to re-check the 'fdt->fd'
971  		 *       pointer having changed, because it always goes
972  		 *       hand-in-hand with 'fdt'.
973  		 *
974  		 * If so, we need to put our ref and try again.
975  		 */
976  		if (unlikely(file != rcu_dereference_raw(*fdentry)) ||
977  		    unlikely(rcu_dereference_raw(files->fdt) != fdt)) {
978  			fput(file);
979  			continue;
980  		}
981  
982  		/*
983  		 * This isn't the file we're looking for or we're not
984  		 * allowed to get a reference to it.
985  		 */
986  		if (unlikely(file->f_mode & mask)) {
987  			fput(file);
988  			return NULL;
989  		}
990  
991  		/*
992  		 * Ok, we have a ref to the file, and checked that it
993  		 * still exists.
994  		 */
995  		return file;
996  	}
997  }
998  
999  static struct file *__fget_files(struct files_struct *files, unsigned int fd,
1000  				 fmode_t mask)
1001  {
1002  	struct file *file;
1003  
1004  	rcu_read_lock();
1005  	file = __fget_files_rcu(files, fd, mask);
1006  	rcu_read_unlock();
1007  
1008  	return file;
1009  }
1010  
1011  static inline struct file *__fget(unsigned int fd, fmode_t mask)
1012  {
1013  	return __fget_files(current->files, fd, mask);
1014  }
1015  
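/*
 * fget() refuses O_PATH descriptors (FMODE_PATH is in the mask); fget_raw()
 * below accepts them.
 */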
1016  struct file *fget(unsigned int fd)
1017  {
1018  	return __fget(fd, FMODE_PATH);
1019  }
1020  EXPORT_SYMBOL(fget);
1021  
1022  struct file *fget_raw(unsigned int fd)
1023  {
1024  	return __fget(fd, 0);
1025  }
1026  EXPORT_SYMBOL(fget_raw);
1027  
1028  struct file *fget_task(struct task_struct *task, unsigned int fd)
1029  {
1030  	struct file *file = NULL;
1031  
1032  	task_lock(task);
1033  	if (task->files)
1034  		file = __fget_files(task->files, fd, 0);
1035  	task_unlock(task);
1036  
1037  	return file;
1038  }
1039  
1040  struct file *lookup_fdget_rcu(unsigned int fd)
1041  {
1042  	return __fget_files_rcu(current->files, fd, 0);
1043  
1044  }
1045  EXPORT_SYMBOL_GPL(lookup_fdget_rcu);
1046  
1047  struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd)
1048  {
1049  	/* Must be called with rcu_read_lock held */
1050  	struct files_struct *files;
1051  	struct file *file = NULL;
1052  
1053  	task_lock(task);
1054  	files = task->files;
1055  	if (files)
1056  		file = __fget_files_rcu(files, fd, 0);
1057  	task_unlock(task);
1058  
1059  	return file;
1060  }
1061  
1062  struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *ret_fd)
1063  {
1064  	/* Must be called with rcu_read_lock held */
1065  	struct files_struct *files;
1066  	unsigned int fd = *ret_fd;
1067  	struct file *file = NULL;
1068  
1069  	task_lock(task);
1070  	files = task->files;
1071  	if (files) {
1072  		for (; fd < files_fdtable(files)->max_fds; fd++) {
1073  			file = __fget_files_rcu(files, fd, 0);
1074  			if (file)
1075  				break;
1076  		}
1077  	}
1078  	task_unlock(task);
1079  	*ret_fd = fd;
1080  	return file;
1081  }
1082  EXPORT_SYMBOL(task_lookup_next_fdget_rcu);
1083  
1084  /*
1085   * Lightweight file lookup - no refcnt increment if fd table isn't shared.
1086   *
1087   * You can use this instead of fget if you satisfy all of the following
1088   * conditions:
1089   * 1) You must call fput_light before exiting the syscall and returning control
1090   *    to userspace (i.e. you cannot remember the returned struct file * after
1091   *    returning to userspace).
1092   * 2) You must not call filp_close on the returned struct file * in between
1093   *    calls to fget_light and fput_light.
1094   * 3) You must not clone the current task in between the calls to fget_light
1095   *    and fput_light.
1096   *
1097   * The fput_needed flag returned by fget_light should be passed to the
1098   * corresponding fput_light.
1099   */
1100  static inline struct fd __fget_light(unsigned int fd, fmode_t mask)
1101  {
1102  	struct files_struct *files = current->files;
1103  	struct file *file;
1104  
1105  	/*
1106  	 * If another thread is concurrently calling close_fd() followed
1107  	 * by put_files_struct(), we must not observe the old table
1108  	 * entry combined with the new refcount - otherwise we could
1109  	 * return a file that is concurrently being freed.
1110  	 *
1111  	 * atomic_read_acquire() pairs with atomic_dec_and_test() in
1112  	 * put_files_struct().
1113  	 */
1114  	if (likely(atomic_read_acquire(&files->count) == 1)) {
1115  		file = files_lookup_fd_raw(files, fd);
1116  		if (!file || unlikely(file->f_mode & mask))
1117  			return EMPTY_FD;
1118  		return BORROWED_FD(file);
1119  	} else {
1120  		file = __fget_files(files, fd, mask);
1121  		if (!file)
1122  			return EMPTY_FD;
1123  		return CLONED_FD(file);
1124  	}
1125  }
1126  struct fd fdget(unsigned int fd)
1127  {
1128  	return __fget_light(fd, FMODE_PATH);
1129  }
1130  EXPORT_SYMBOL(fdget);
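/*
 * A minimal usage sketch: the returned struct fd either borrows the table's
 * reference or carries one of its own, and fdput() does the right thing in
 * both cases ("do_something()" is a stand-in for the actual work):
 *
 *	struct fd f = fdget(fd);
 *	if (!fd_file(f))
 *		return -EBADF;
 *	ret = do_something(fd_file(f));
 *	fdput(f);
 *	return ret;
 */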
1131  
1132  struct fd fdget_raw(unsigned int fd)
1133  {
1134  	return __fget_light(fd, 0);
1135  }
1136  
1137  /*
1138   * Try to avoid f_pos locking. We only need it if the
1139   * file is marked for FMODE_ATOMIC_POS, and it can be
1140   * accessed multiple ways.
1141   *
1142   * Always do it for directories, because pidfd_getfd()
1143   * can make a file accessible even if it otherwise would
1144   * not be, and for directories this is a correctness
1145   * issue, not a "POSIX requirement".
1146   */
1147  static inline bool file_needs_f_pos_lock(struct file *file)
1148  {
1149  	return (file->f_mode & FMODE_ATOMIC_POS) &&
1150  		(file_count(file) > 1 || file->f_op->iterate_shared);
1151  }
1152  
1153  struct fd fdget_pos(unsigned int fd)
1154  {
1155  	struct fd f = fdget(fd);
1156  	struct file *file = fd_file(f);
1157  
1158  	if (file && file_needs_f_pos_lock(file)) {
1159  		f.word |= FDPUT_POS_UNLOCK;
1160  		mutex_lock(&file->f_pos_lock);
1161  	}
1162  	return f;
1163  }
1164  
1165  void __f_unlock_pos(struct file *f)
1166  {
1167  	mutex_unlock(&f->f_pos_lock);
1168  }
1169  
1170  /*
1171   * We only lock f_pos if we have threads or if the file might be
1172   * shared with another process. In both cases we'll have an elevated
1173   * file count (done either by fdget() or by fork()).
1174   */
1175  
1176  void set_close_on_exec(unsigned int fd, int flag)
1177  {
1178  	struct files_struct *files = current->files;
1179  	struct fdtable *fdt;
1180  	spin_lock(&files->file_lock);
1181  	fdt = files_fdtable(files);
1182  	if (flag)
1183  		__set_close_on_exec(fd, fdt);
1184  	else
1185  		__clear_close_on_exec(fd, fdt);
1186  	spin_unlock(&files->file_lock);
1187  }
1188  
1189  bool get_close_on_exec(unsigned int fd)
1190  {
1191  	bool res;
1192  	rcu_read_lock();
1193  	res = close_on_exec(fd, current->files);
1194  	rcu_read_unlock();
1195  	return res;
1196  }
1197  
1198  static int do_dup2(struct files_struct *files,
1199  	struct file *file, unsigned fd, unsigned flags)
1200  __releases(&files->file_lock)
1201  {
1202  	struct file *tofree;
1203  	struct fdtable *fdt;
1204  
1205  	/*
1206  	 * We need to detect attempts to do dup2() over allocated but still
1207  	 * not finished descriptor.  NB: OpenBSD avoids that at the price of
1208  	 * extra work in their equivalent of fget() - they insert struct
1209  	 * file immediately after grabbing descriptor, mark it larval if
1210  	 * more work (e.g. actual opening) is needed and make sure that
1211  	 * fget() treats larval files as absent.  Potentially interesting,
1212  	 * but while extra work in fget() is trivial, locking implications
1213  	 * and amount of surgery on open()-related paths in VFS are not.
1214  	 * FreeBSD fails with -EBADF in the same situation, NetBSD "solution"
1215  	 * deadlocks in rather amusing ways, AFAICS.  All of that is out of
1216  	 * scope of POSIX or SUS, since neither considers shared descriptor
1217  	 * tables and this condition does not arise without those.
1218  	 */
1219  	fdt = files_fdtable(files);
1220  	fd = array_index_nospec(fd, fdt->max_fds);
1221  	tofree = fdt->fd[fd];
1222  	if (!tofree && fd_is_open(fd, fdt))
1223  		goto Ebusy;
1224  	get_file(file);
1225  	rcu_assign_pointer(fdt->fd[fd], file);
1226  	__set_open_fd(fd, fdt);
1227  	if (flags & O_CLOEXEC)
1228  		__set_close_on_exec(fd, fdt);
1229  	else
1230  		__clear_close_on_exec(fd, fdt);
1231  	spin_unlock(&files->file_lock);
1232  
1233  	if (tofree)
1234  		filp_close(tofree, files);
1235  
1236  	return fd;
1237  
1238  Ebusy:
1239  	spin_unlock(&files->file_lock);
1240  	return -EBUSY;
1241  }
1242  
1243  int replace_fd(unsigned fd, struct file *file, unsigned flags)
1244  {
1245  	int err;
1246  	struct files_struct *files = current->files;
1247  
1248  	if (!file)
1249  		return close_fd(fd);
1250  
1251  	if (fd >= rlimit(RLIMIT_NOFILE))
1252  		return -EBADF;
1253  
1254  	spin_lock(&files->file_lock);
1255  	err = expand_files(files, fd);
1256  	if (unlikely(err < 0))
1257  		goto out_unlock;
1258  	return do_dup2(files, file, fd, flags);
1259  
1260  out_unlock:
1261  	spin_unlock(&files->file_lock);
1262  	return err;
1263  }
1264  
1265  /**
1266   * receive_fd() - Install received file into file descriptor table
1267   * @file: struct file that was received from another process
1268   * @ufd: __user pointer to write new fd number to
1269   * @o_flags: the O_* flags to apply to the new fd entry
1270   *
1271   * Installs a received file into the file descriptor table, with appropriate
1272   * checks and count updates. Optionally writes the fd number to userspace, if
1273   * @ufd is non-NULL.
1274   *
1275   * This helper handles its own reference counting of the incoming
1276   * struct file.
1277   *
1278   * Returns the newly installed fd or a negative error code on failure.
1279   */
1280  int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
1281  {
1282  	int new_fd;
1283  	int error;
1284  
1285  	error = security_file_receive(file);
1286  	if (error)
1287  		return error;
1288  
1289  	new_fd = get_unused_fd_flags(o_flags);
1290  	if (new_fd < 0)
1291  		return new_fd;
1292  
1293  	if (ufd) {
1294  		error = put_user(new_fd, ufd);
1295  		if (error) {
1296  			put_unused_fd(new_fd);
1297  			return error;
1298  		}
1299  	}
1300  
1301  	fd_install(new_fd, get_file(file));
1302  	__receive_sock(file);
1303  	return new_fd;
1304  }
1305  EXPORT_SYMBOL_GPL(receive_fd);
1306  
1307  int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
1308  {
1309  	int error;
1310  
1311  	error = security_file_receive(file);
1312  	if (error)
1313  		return error;
1314  	error = replace_fd(new_fd, file, o_flags);
1315  	if (error)
1316  		return error;
1317  	__receive_sock(file);
1318  	return new_fd;
1319  }
1320  
1321  static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
1322  {
1323  	int err = -EBADF;
1324  	struct file *file;
1325  	struct files_struct *files = current->files;
1326  
1327  	if ((flags & ~O_CLOEXEC) != 0)
1328  		return -EINVAL;
1329  
1330  	if (unlikely(oldfd == newfd))
1331  		return -EINVAL;
1332  
1333  	if (newfd >= rlimit(RLIMIT_NOFILE))
1334  		return -EBADF;
1335  
1336  	spin_lock(&files->file_lock);
1337  	err = expand_files(files, newfd);
1338  	file = files_lookup_fd_locked(files, oldfd);
1339  	if (unlikely(!file))
1340  		goto Ebadf;
1341  	if (unlikely(err < 0)) {
1342  		if (err == -EMFILE)
1343  			goto Ebadf;
1344  		goto out_unlock;
1345  	}
1346  	return do_dup2(files, file, newfd, flags);
1347  
1348  Ebadf:
1349  	err = -EBADF;
1350  out_unlock:
1351  	spin_unlock(&files->file_lock);
1352  	return err;
1353  }
1354  
1355  SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
1356  {
1357  	return ksys_dup3(oldfd, newfd, flags);
1358  }
1359  
1360  SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
1361  {
1362  	if (unlikely(newfd == oldfd)) { /* corner case */
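		/*
		 * POSIX requires dup2(fd, fd) to return fd when it is a valid
		 * open descriptor and to fail with EBADF otherwise, so all
		 * that is needed here is to validate oldfd.
		 */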
1363  		struct files_struct *files = current->files;
1364  		struct file *f;
1365  		int retval = oldfd;
1366  
1367  		rcu_read_lock();
1368  		f = __fget_files_rcu(files, oldfd, 0);
1369  		if (!f)
1370  			retval = -EBADF;
1371  		rcu_read_unlock();
1372  		if (f)
1373  			fput(f);
1374  		return retval;
1375  	}
1376  	return ksys_dup3(oldfd, newfd, 0);
1377  }
1378  
1379  SYSCALL_DEFINE1(dup, unsigned int, fildes)
1380  {
1381  	int ret = -EBADF;
1382  	struct file *file = fget_raw(fildes);
1383  
1384  	if (file) {
1385  		ret = get_unused_fd_flags(0);
1386  		if (ret >= 0)
1387  			fd_install(ret, file);
1388  		else
1389  			fput(file);
1390  	}
1391  	return ret;
1392  }
1393  
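/*
 * Install an extra reference to 'file' in the lowest unused descriptor that
 * is >= 'from'.  Returns the new descriptor, or a negative error code
 * (-EINVAL if 'from' is already at or beyond the caller's RLIMIT_NOFILE).
 */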
1394  int f_dupfd(unsigned int from, struct file *file, unsigned flags)
1395  {
1396  	unsigned long nofile = rlimit(RLIMIT_NOFILE);
1397  	int err;
1398  	if (from >= nofile)
1399  		return -EINVAL;
1400  	err = alloc_fd(from, nofile, flags);
1401  	if (err >= 0) {
1402  		get_file(file);
1403  		fd_install(err, file);
1404  	}
1405  	return err;
1406  }
1407  
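/*
 * Invoke f(p, file, n) for every installed file starting at descriptor n,
 * stopping at the first non-zero return value, which is then passed back to
 * the caller (0 if the whole table was walked).  The callback runs under
 * ->file_lock and must not sleep.
 */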
1408  int iterate_fd(struct files_struct *files, unsigned n,
1409  		int (*f)(const void *, struct file *, unsigned),
1410  		const void *p)
1411  {
1412  	struct fdtable *fdt;
1413  	int res = 0;
1414  	if (!files)
1415  		return 0;
1416  	spin_lock(&files->file_lock);
1417  	for (fdt = files_fdtable(files); n < fdt->max_fds; n++) {
1418  		struct file *file;
1419  		file = rcu_dereference_check_fdtable(files, fdt->fd[n]);
1420  		if (!file)
1421  			continue;
1422  		res = f(p, file, n);
1423  		if (res)
1424  			break;
1425  	}
1426  	spin_unlock(&files->file_lock);
1427  	return res;
1428  }
1429  EXPORT_SYMBOL(iterate_fd);
1430