xref: /linux/fs/file.c (revision fe78e02600f83d81e55f6fc352d82c4f264a2901)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *  linux/fs/file.c
4  *
5  *  Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
6  *
7  *  Manage the dynamic fd arrays in the process files_struct.
8  */
9 
10 #include <linux/syscalls.h>
11 #include <linux/export.h>
12 #include <linux/fs.h>
13 #include <linux/kernel.h>
14 #include <linux/mm.h>
15 #include <linux/sched/signal.h>
16 #include <linux/slab.h>
17 #include <linux/file.h>
18 #include <linux/fdtable.h>
19 #include <linux/bitops.h>
20 #include <linux/spinlock.h>
21 #include <linux/rcupdate.h>
22 #include <linux/close_range.h>
23 #include <linux/file_ref.h>
24 #include <net/sock.h>
25 #include <linux/init_task.h>
26 
27 #include "internal.h"
28 
29 static noinline bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt)
30 {
31 	/*
32 	 * If the reference count was already in the dead zone, then this
33 	 * put() operation is imbalanced. Warn, put the reference count back to
34 	 * DEAD and tell the caller to not deconstruct the object.
35 	 */
36 	if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) {
37 		atomic_long_set(&ref->refcnt, FILE_REF_DEAD);
38 		return false;
39 	}
40 
41 	/*
42 	 * This is a put() operation on a saturated refcount. Restore the
43 	 * mean saturation value and tell the caller to not deconstruct the
44 	 * object.
45 	 */
46 	if (cnt > FILE_REF_MAXREF)
47 		atomic_long_set(&ref->refcnt, FILE_REF_SATURATED);
48 	return false;
49 }
50 
51 /**
52  * __file_ref_put - Slowpath of file_ref_put()
53  * @ref:	Pointer to the reference count
54  * @cnt:	Current reference count
55  *
56  * Invoked when the reference count is outside of the valid zone.
57  *
58  * Return:
59  *	True if this was the last reference with no future references
60  *	possible. This signals the caller that it can safely schedule the
61  *	object, which is protected by the reference counter, for
62  *	deconstruction.
63  *
64  *	False if there are still active references or the put() raced
65  *	with a concurrent get()/put() pair. Caller is not allowed to
66  *	deconstruct the protected object.
67  */
68 bool __file_ref_put(file_ref_t *ref, unsigned long cnt)
69 {
70 	/* Did this drop the last reference? */
71 	if (likely(cnt == FILE_REF_NOREF)) {
72 		/*
73 		 * Carefully try to set the reference count to FILE_REF_DEAD.
74 		 *
75 		 * This can fail if a concurrent get() operation has
76 		 * elevated it again or the corresponding put() even marked
77 		 * it dead already. Both are valid situations and do not
78 		 * require a retry. If this fails the caller is not
79 		 * allowed to deconstruct the object.
80 		 */
81 		if (!atomic_long_try_cmpxchg_release(&ref->refcnt, &cnt, FILE_REF_DEAD))
82 			return false;
83 
84 		/*
85 		 * The caller can safely schedule the object for
86 		 * deconstruction. Provide acquire ordering.
87 		 */
88 		smp_acquire__after_ctrl_dep();
89 		return true;
90 	}
91 
92 	return __file_ref_put_badval(ref, cnt);
93 }
94 EXPORT_SYMBOL_GPL(__file_ref_put);
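
/*
 * Editorial sketch (not part of file.c): how a hypothetical object could
 * use file_ref_put(), whose slowpath is __file_ref_put() above.  The real
 * user is struct file's f_ref; 'my_obj' and my_obj_put() are made-up
 * names, and a real user would typically defer the free (RCU or
 * SLAB_TYPESAFE_BY_RCU) rather than kfree() directly.
 */
struct my_obj {
	file_ref_t ref;
	/* ... payload ... */
};

static void my_obj_put(struct my_obj *obj)
{
	/* file_ref_put() returns true only for the final reference. */
	if (file_ref_put(&obj->ref))
		kfree(obj);
}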
95 
96 unsigned int sysctl_nr_open __read_mostly = 1024*1024;
97 unsigned int sysctl_nr_open_min = BITS_PER_LONG;
98 /* our min() is unusable in constant expressions ;-/ */
99 #define __const_min(x, y) ((x) < (y) ? (x) : (y))
100 unsigned int sysctl_nr_open_max =
101 	__const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG;
102 
103 static void __free_fdtable(struct fdtable *fdt)
104 {
105 	kvfree(fdt->fd);
106 	kvfree(fdt->open_fds);
107 	kfree(fdt);
108 }
109 
110 static void free_fdtable_rcu(struct rcu_head *rcu)
111 {
112 	__free_fdtable(container_of(rcu, struct fdtable, rcu));
113 }
114 
115 #define BITBIT_NR(nr)	BITS_TO_LONGS(BITS_TO_LONGS(nr))
116 #define BITBIT_SIZE(nr)	(BITBIT_NR(nr) * sizeof(long))
117 
118 #define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG) // words in ->open_fds
119 /*
120  * Copy 'copy_words' worth of fd bits from the old table to the new table and
121  * clear the extra space, if any.  This does not copy the file pointers.  Called
122  * with the files spinlock held for write.
123  */
124 static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
125 			    unsigned int copy_words)
126 {
127 	unsigned int nwords = fdt_words(nfdt);
128 
129 	bitmap_copy_and_extend(nfdt->open_fds, ofdt->open_fds,
130 			copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG);
131 	bitmap_copy_and_extend(nfdt->close_on_exec, ofdt->close_on_exec,
132 			copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG);
133 	bitmap_copy_and_extend(nfdt->full_fds_bits, ofdt->full_fds_bits,
134 			copy_words, nwords);
135 }
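
/*
 * Minimal standalone userspace sketch (editorial; assumes copy/size are
 * multiples of BITS_PER_LONG, as they are in this file) of the "copy then
 * zero-extend" semantics that the bitmap_copy_and_extend() calls above rely
 * on for ->open_fds and ->close_on_exec.  BITS_PER_LONG here is a local
 * stand-in for the kernel macro.
 */
#include <stdio.h>
#include <string.h>

#define BITS_PER_LONG (8 * (unsigned int)sizeof(unsigned long))

static void copy_and_extend(unsigned long *dst, const unsigned long *src,
			    unsigned int copy_bits, unsigned int size_bits)
{
	unsigned int copy_longs = copy_bits / BITS_PER_LONG;

	memcpy(dst, src, copy_longs * sizeof(unsigned long));
	memset(dst + copy_longs, 0, (size_bits - copy_bits) / 8);
}

int main(void)
{
	unsigned long old_fds[1] = { 0x2bUL };	/* fds 0, 1, 3, 5 open */
	unsigned long new_fds[4];

	copy_and_extend(new_fds, old_fds, BITS_PER_LONG, 4 * BITS_PER_LONG);
	printf("%lx %lx\n", new_fds[0], new_fds[1]);	/* prints: 2b 0 */
	return 0;
}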
136 
137 /*
138  * Copy all file descriptors from the old table to the new, expanded table and
139  * clear the extra space.  Called with the files spinlock held for write.
140  */
141 static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
142 {
143 	size_t cpy, set;
144 
145 	BUG_ON(nfdt->max_fds < ofdt->max_fds);
146 
147 	cpy = ofdt->max_fds * sizeof(struct file *);
148 	set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
149 	memcpy(nfdt->fd, ofdt->fd, cpy);
150 	memset((char *)nfdt->fd + cpy, 0, set);
151 
152 	copy_fd_bitmaps(nfdt, ofdt, fdt_words(ofdt));
153 }
154 
155 /*
156  * Note how the fdtable bitmap allocations very much have to be a multiple of
157  * BITS_PER_LONG. This is not only because we walk those things in chunks of
158  * 'unsigned long' in some places, but simply because that is how the Linux
159  * kernel bitmaps are defined to work: they are not "bits in an array of bytes",
160  * they are very much "bits in an array of unsigned long".
161  */
162 static struct fdtable *alloc_fdtable(unsigned int slots_wanted)
163 {
164 	struct fdtable *fdt;
165 	unsigned int nr;
166 	void *data;
167 
168 	/*
169 	 * Figure out how many fds we actually want to support in this fdtable.
170 	 * Allocation steps are keyed to the size of the fdarray, since it
171 	 * grows far faster than any of the other dynamic data. We try to fit
172 	 * the fdarray into comfortable page-tuned chunks: starting at 1024B
173  * and growing in powers of two from there on.  Since we are called only
174 	 * with slots_wanted > BITS_PER_LONG (embedded instance in files->fdtab
175 	 * already gives BITS_PER_LONG slots), the above boils down to
176 	 * 1.  use the smallest power of two large enough to give us that many
177 	 * slots.
178 	 * 2.  on 32bit skip 64 and 128 - the minimal capacity we want there is
179 	 * 256 slots (i.e. 1Kb fd array).
180 	 * 3.  on 64bit don't skip anything, 1Kb fd array means 128 slots there
181 	 * and we are never going to be asked for 64 or less.
182 	 */
183 	if (IS_ENABLED(CONFIG_32BIT) && slots_wanted < 256)
184 		nr = 256;
185 	else
186 		nr = roundup_pow_of_two(slots_wanted);
187 	/*
188 	 * Note that this can drive nr *below* what we had passed if sysctl_nr_open
189 	 * had been set lower between the check in expand_files() and here.
190 	 *
191 	 * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
192 	 * bitmaps handling below becomes unpleasant, to put it mildly...
193 	 */
194 	if (unlikely(nr > sysctl_nr_open)) {
195 		nr = round_down(sysctl_nr_open, BITS_PER_LONG);
196 		if (nr < slots_wanted)
197 			return ERR_PTR(-EMFILE);
198 	}
199 
200 	fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
201 	if (!fdt)
202 		goto out;
203 	fdt->max_fds = nr;
204 	data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT);
205 	if (!data)
206 		goto out_fdt;
207 	fdt->fd = data;
208 
209 	data = kvmalloc(max_t(size_t,
210 				 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES),
211 				 GFP_KERNEL_ACCOUNT);
212 	if (!data)
213 		goto out_arr;
214 	fdt->open_fds = data;
215 	data += nr / BITS_PER_BYTE;
216 	fdt->close_on_exec = data;
217 	data += nr / BITS_PER_BYTE;
218 	fdt->full_fds_bits = data;
219 
220 	return fdt;
221 
222 out_arr:
223 	kvfree(fdt->fd);
224 out_fdt:
225 	kfree(fdt);
226 out:
227 	return ERR_PTR(-ENOMEM);
228 }
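
/*
 * Standalone userspace sketch (editorial) of the slot rounding done by
 * alloc_fdtable() above: round the request up to a power of two, with a
 * 256-slot floor on 32-bit, then clamp to a BITS_PER_LONG-aligned
 * sysctl_nr_open.  All names below are local stand-ins, not kernel APIs.
 */
#include <stdio.h>

#define BITS_PER_LONG (8 * (unsigned int)sizeof(unsigned long))

static unsigned int roundup_pow_of_two_ish(unsigned int n)
{
	unsigned int r = 1;

	while (r < n)
		r <<= 1;
	return r;
}

static unsigned int pick_fdtable_size(unsigned int slots_wanted,
				      unsigned int nr_open, int is_32bit)
{
	unsigned int nr;

	if (is_32bit && slots_wanted < 256)
		nr = 256;
	else
		nr = roundup_pow_of_two_ish(slots_wanted);
	if (nr > nr_open) {
		nr = nr_open - (nr_open % BITS_PER_LONG);	/* round_down */
		if (nr < slots_wanted)
			return 0;	/* the kernel returns -EMFILE here */
	}
	return nr;
}

int main(void)
{
	/* asking for 300 slots on 64-bit yields a 512-slot table */
	printf("%u\n", pick_fdtable_size(300, 1024 * 1024, 0));
	return 0;
}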
229 
230 /*
231  * Expand the file descriptor table.
232  * This function will allocate a new fdtable and both fd array and fdset, of
233  * the given size.
234  * Return <0 error code on error; 0 on successful completion.
235  * The files->file_lock should be held on entry, and will be held on exit.
236  */
237 static int expand_fdtable(struct files_struct *files, unsigned int nr)
238 	__releases(files->file_lock)
239 	__acquires(files->file_lock)
240 {
241 	struct fdtable *new_fdt, *cur_fdt;
242 
243 	spin_unlock(&files->file_lock);
244 	new_fdt = alloc_fdtable(nr + 1);
245 
246 	/* make sure all fd_install() callers have seen resize_in_progress
247 	 * or have finished their rcu_read_lock_sched() section.
248 	 */
249 	if (atomic_read(&files->count) > 1)
250 		synchronize_rcu();
251 
252 	spin_lock(&files->file_lock);
253 	if (IS_ERR(new_fdt))
254 		return PTR_ERR(new_fdt);
255 	cur_fdt = files_fdtable(files);
256 	BUG_ON(nr < cur_fdt->max_fds);
257 	copy_fdtable(new_fdt, cur_fdt);
258 	rcu_assign_pointer(files->fdt, new_fdt);
259 	if (cur_fdt != &files->fdtab)
260 		call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
261 	/* coupled with smp_rmb() in fd_install() */
262 	smp_wmb();
263 	return 0;
264 }
265 
266 /*
267  * Expand files.
268  * This function will expand the file structures, if the requested size exceeds
269  * the current capacity and there is room for expansion.
270  * Return <0 error code on error; 0 on success.
271  * The files->file_lock should be held on entry, and will be held on exit.
272  */
273 static int expand_files(struct files_struct *files, unsigned int nr)
274 	__releases(files->file_lock)
275 	__acquires(files->file_lock)
276 {
277 	struct fdtable *fdt;
278 	int error;
279 
280 repeat:
281 	fdt = files_fdtable(files);
282 
283 	/* Do we need to expand? */
284 	if (nr < fdt->max_fds)
285 		return 0;
286 
287 	if (unlikely(files->resize_in_progress)) {
288 		spin_unlock(&files->file_lock);
289 		wait_event(files->resize_wait, !files->resize_in_progress);
290 		spin_lock(&files->file_lock);
291 		goto repeat;
292 	}
293 
294 	/* Can we expand? */
295 	if (unlikely(nr >= sysctl_nr_open))
296 		return -EMFILE;
297 
298 	/* All good, so we try */
299 	files->resize_in_progress = true;
300 	error = expand_fdtable(files, nr);
301 	files->resize_in_progress = false;
302 
303 	wake_up_all(&files->resize_wait);
304 	return error;
305 }
306 
307 static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt,
308 				       bool set)
309 {
310 	if (set) {
311 		__set_bit(fd, fdt->close_on_exec);
312 	} else {
313 		if (test_bit(fd, fdt->close_on_exec))
314 			__clear_bit(fd, fdt->close_on_exec);
315 	}
316 }
317 
318 static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt, bool set)
319 {
320 	__set_bit(fd, fdt->open_fds);
321 	__set_close_on_exec(fd, fdt, set);
322 	fd /= BITS_PER_LONG;
323 	if (!~fdt->open_fds[fd])
324 		__set_bit(fd, fdt->full_fds_bits);
325 }
326 
327 static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
328 {
329 	__clear_bit(fd, fdt->open_fds);
330 	fd /= BITS_PER_LONG;
331 	if (test_bit(fd, fdt->full_fds_bits))
332 		__clear_bit(fd, fdt->full_fds_bits);
333 }
334 
335 static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
336 {
337 	return test_bit(fd, fdt->open_fds);
338 }
339 
340 /*
341  * Note that a sane fdtable size always has to be a multiple of
342  * BITS_PER_LONG, since we have bitmaps that are sized by this.
343  *
344  * punch_hole is optional - when close_range() is asked to unshare
345  * and close, we don't need to copy descriptors in that range, so
346  * a smaller cloned descriptor table might suffice if the last
347  * currently opened descriptor falls into that range.
348  */
349 static unsigned int sane_fdtable_size(struct fdtable *fdt, struct fd_range *punch_hole)
350 {
351 	unsigned int last = find_last_bit(fdt->open_fds, fdt->max_fds);
352 
353 	if (last == fdt->max_fds)
354 		return NR_OPEN_DEFAULT;
355 	if (punch_hole && punch_hole->to >= last && punch_hole->from <= last) {
356 		last = find_last_bit(fdt->open_fds, punch_hole->from);
357 		if (last == punch_hole->from)
358 			return NR_OPEN_DEFAULT;
359 	}
360 	return ALIGN(last + 1, BITS_PER_LONG);
361 }
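
/*
 * Worked example (editorial): with BITS_PER_LONG == 64, suppose fds 5 and 70
 * are the only open descriptors.  find_last_bit() gives last == 70, so a
 * plain copy needs ALIGN(71, 64) == 128 slots.  If close_range() is unsharing
 * and about to close 64..100 (punch_hole = {64, 100}), the hole covers fd 70,
 * so the scan is redone over the first 64 bits; that finds fd 5, and
 * ALIGN(6, 64) == 64 slots, i.e. the embedded NR_OPEN_DEFAULT table, is
 * enough for the clone.
 */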
362 
363 /*
364  * Allocate a new descriptor table and copy contents from the passed in
365  * instance.  Returns a pointer to cloned table on success, ERR_PTR()
366  * on failure.  For 'punch_hole' see sane_fdtable_size().
367  */
368 struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_hole)
369 {
370 	struct files_struct *newf;
371 	struct file **old_fds, **new_fds;
372 	unsigned int open_files, i;
373 	struct fdtable *old_fdt, *new_fdt;
374 
375 	newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
376 	if (!newf)
377 		return ERR_PTR(-ENOMEM);
378 
379 	atomic_set(&newf->count, 1);
380 
381 	spin_lock_init(&newf->file_lock);
382 	newf->resize_in_progress = false;
383 	init_waitqueue_head(&newf->resize_wait);
384 	newf->next_fd = 0;
385 	new_fdt = &newf->fdtab;
386 	new_fdt->max_fds = NR_OPEN_DEFAULT;
387 	new_fdt->close_on_exec = newf->close_on_exec_init;
388 	new_fdt->open_fds = newf->open_fds_init;
389 	new_fdt->full_fds_bits = newf->full_fds_bits_init;
390 	new_fdt->fd = &newf->fd_array[0];
391 
392 	spin_lock(&oldf->file_lock);
393 	old_fdt = files_fdtable(oldf);
394 	open_files = sane_fdtable_size(old_fdt, punch_hole);
395 
396 	/*
397 	 * Check whether we need to allocate a larger fd array and fd set.
398 	 */
399 	while (unlikely(open_files > new_fdt->max_fds)) {
400 		spin_unlock(&oldf->file_lock);
401 
402 		if (new_fdt != &newf->fdtab)
403 			__free_fdtable(new_fdt);
404 
405 		new_fdt = alloc_fdtable(open_files);
406 		if (IS_ERR(new_fdt)) {
407 			kmem_cache_free(files_cachep, newf);
408 			return ERR_CAST(new_fdt);
409 		}
410 
411 		/*
412 		 * Reacquire the oldf lock and a pointer to its fd table -
413 		 * who knows, it may have grown a new, bigger fd table in
414 		 * the meantime. We need the latest pointer.
415 		 */
416 		spin_lock(&oldf->file_lock);
417 		old_fdt = files_fdtable(oldf);
418 		open_files = sane_fdtable_size(old_fdt, punch_hole);
419 	}
420 
421 	copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG);
422 
423 	old_fds = old_fdt->fd;
424 	new_fds = new_fdt->fd;
425 
426 	/*
427 	 * We may be racing against fd allocation from other threads using this
428 	 * files_struct, despite holding ->file_lock.
429 	 *
430 	 * alloc_fd() might have already claimed a slot, while fd_install()
431 	 * did not populate it yet. Note the latter operates locklessly, so
432 	 * the file can show up as we are walking the array below.
433 	 *
434 	 * At the same time we know no files will disappear as all other
435 	 * operations take the lock.
436 	 *
437 	 * Instead of trying to placate userspace racing with itself, we
438 	 * ref the file if we see it and mark the fd slot as unused otherwise.
439 	 */
440 	for (i = open_files; i != 0; i--) {
441 		struct file *f = rcu_dereference_raw(*old_fds++);
442 		if (f) {
443 			get_file(f);
444 		} else {
445 			__clear_open_fd(open_files - i, new_fdt);
446 		}
447 		rcu_assign_pointer(*new_fds++, f);
448 	}
449 	spin_unlock(&oldf->file_lock);
450 
451 	/* clear the remainder */
452 	memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *));
453 
454 	rcu_assign_pointer(newf->fdt, new_fdt);
455 
456 	return newf;
457 }
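
/*
 * Hedged sketch of a dup_fd() caller, loosely modelled on the fork path
 * (copy_files() in kernel/fork.c) and the CLOSE_RANGE_UNSHARE path later in
 * this file; error handling and the handover of the old table are
 * abbreviated, and example_unshare_fd() is a made-up name.
 */
static int example_unshare_fd(struct task_struct *tsk)
{
	struct files_struct *newf;

	/* NULL punch_hole: copy every currently open descriptor */
	newf = dup_fd(tsk->files, NULL);
	if (IS_ERR(newf))
		return PTR_ERR(newf);

	/* real callers swap this in under task_lock() and drop the old table */
	tsk->files = newf;
	return 0;
}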
458 
459 static struct fdtable *close_files(struct files_struct * files)
460 {
461 	/*
462 	 * It is safe to dereference the fd table without RCU or
463 	 * ->file_lock because this is the last reference to the
464 	 * files structure.
465 	 */
466 	struct fdtable *fdt = rcu_dereference_raw(files->fdt);
467 	unsigned int i, j = 0;
468 
469 	for (;;) {
470 		unsigned long set;
471 		i = j * BITS_PER_LONG;
472 		if (i >= fdt->max_fds)
473 			break;
474 		set = fdt->open_fds[j++];
475 		while (set) {
476 			if (set & 1) {
477 				struct file *file = fdt->fd[i];
478 				if (file) {
479 					filp_close(file, files);
480 					cond_resched();
481 				}
482 			}
483 			i++;
484 			set >>= 1;
485 		}
486 	}
487 
488 	return fdt;
489 }
490 
491 void put_files_struct(struct files_struct *files)
492 {
493 	if (atomic_dec_and_test(&files->count)) {
494 		struct fdtable *fdt = close_files(files);
495 
496 		/* free the arrays if they are not embedded */
497 		if (fdt != &files->fdtab)
498 			__free_fdtable(fdt);
499 		kmem_cache_free(files_cachep, files);
500 	}
501 }
502 
503 void exit_files(struct task_struct *tsk)
504 {
505 	struct files_struct * files = tsk->files;
506 
507 	if (files) {
508 		task_lock(tsk);
509 		tsk->files = NULL;
510 		task_unlock(tsk);
511 		put_files_struct(files);
512 	}
513 }
514 
515 struct files_struct init_files = {
516 	.count		= ATOMIC_INIT(1),
517 	.fdt		= &init_files.fdtab,
518 	.fdtab		= {
519 		.max_fds	= NR_OPEN_DEFAULT,
520 		.fd		= &init_files.fd_array[0],
521 		.close_on_exec	= init_files.close_on_exec_init,
522 		.open_fds	= init_files.open_fds_init,
523 		.full_fds_bits	= init_files.full_fds_bits_init,
524 	},
525 	.file_lock	= __SPIN_LOCK_UNLOCKED(init_files.file_lock),
526 	.resize_wait	= __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait),
527 };
528 
529 static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
530 {
531 	unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */
532 	unsigned int maxbit = maxfd / BITS_PER_LONG;
533 	unsigned int bitbit = start / BITS_PER_LONG;
534 	unsigned int bit;
535 
536 	/*
537 	 * Try to avoid looking at the second level bitmap
538 	 */
539 	bit = find_next_zero_bit(&fdt->open_fds[bitbit], BITS_PER_LONG,
540 				 start & (BITS_PER_LONG - 1));
541 	if (bit < BITS_PER_LONG)
542 		return bit + bitbit * BITS_PER_LONG;
543 
544 	bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
545 	if (bitbit >= maxfd)
546 		return maxfd;
547 	if (bitbit > start)
548 		start = bitbit;
549 	return find_next_zero_bit(fdt->open_fds, maxfd, start);
550 }
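
/*
 * Standalone userspace sketch (editorial; a simplified equivalent, not a
 * line-by-line translation) of the two-level search above: full_fds_bits
 * carries one bit per word of open_fds, set by __set_open_fd() only when
 * that word is completely full, so the allocator can skip dense regions
 * without scanning them bit by bit.  BITS_PER_LONG is a local stand-in.
 */
#include <stdio.h>

#define BITS_PER_LONG (8 * (unsigned int)sizeof(unsigned long))

static unsigned int next_free_fd(const unsigned long *open_fds,
				 const unsigned long *full_fds_bits,
				 unsigned int maxfd, unsigned int start)
{
	unsigned int word = start / BITS_PER_LONG;
	unsigned int bit;

	/* First look in the word that contains 'start'. */
	for (bit = start % BITS_PER_LONG; bit < BITS_PER_LONG; bit++)
		if (!(open_fds[word] & (1UL << bit)))
			return word * BITS_PER_LONG + bit;

	/* Then skip any word marked full in the summary bitmap. */
	for (word++; word < maxfd / BITS_PER_LONG; word++) {
		if (full_fds_bits[word / BITS_PER_LONG] & (1UL << (word % BITS_PER_LONG)))
			continue;
		for (bit = 0; bit < BITS_PER_LONG; bit++)
			if (!(open_fds[word] & (1UL << bit)))
				return word * BITS_PER_LONG + bit;
	}
	return maxfd;	/* table is full */
}

int main(void)
{
	unsigned long open_fds[4] = { ~0UL, ~0UL, ~0UL, 0x1UL };	/* 0..191 and 192 open */
	unsigned long full_fds_bits[1] = { 0x7UL };			/* words 0..2 full */

	/* expect 193; words 1 and 2 are skipped via the summary bitmap */
	printf("%u\n", next_free_fd(open_fds, full_fds_bits, 256, 10));
	return 0;
}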
551 
552 /*
553  * allocate a file descriptor, mark it busy.
554  */
555 static int alloc_fd(unsigned start, unsigned end, unsigned flags)
556 {
557 	struct files_struct *files = current->files;
558 	unsigned int fd;
559 	int error;
560 	struct fdtable *fdt;
561 
562 	spin_lock(&files->file_lock);
563 repeat:
564 	fdt = files_fdtable(files);
565 	fd = start;
566 	if (fd < files->next_fd)
567 		fd = files->next_fd;
568 
569 	if (likely(fd < fdt->max_fds))
570 		fd = find_next_fd(fdt, fd);
571 
572 	/*
573 	 * N.B. For clone tasks sharing a files structure, this test
574 	 * will limit the total number of files that can be opened.
575 	 */
576 	error = -EMFILE;
577 	if (unlikely(fd >= end))
578 		goto out;
579 
580 	if (unlikely(fd >= fdt->max_fds)) {
581 		error = expand_files(files, fd);
582 		if (error < 0)
583 			goto out;
584 
585 		goto repeat;
586 	}
587 
588 	if (start <= files->next_fd)
589 		files->next_fd = fd + 1;
590 
591 	__set_open_fd(fd, fdt, flags & O_CLOEXEC);
592 	error = fd;
593 	VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);
594 
595 out:
596 	spin_unlock(&files->file_lock);
597 	return error;
598 }
599 
600 int __get_unused_fd_flags(unsigned flags, unsigned long nofile)
601 {
602 	return alloc_fd(0, nofile, flags);
603 }
604 
605 int get_unused_fd_flags(unsigned flags)
606 {
607 	return __get_unused_fd_flags(flags, rlimit(RLIMIT_NOFILE));
608 }
609 EXPORT_SYMBOL(get_unused_fd_flags);
610 
611 static void __put_unused_fd(struct files_struct *files, unsigned int fd)
612 {
613 	struct fdtable *fdt = files_fdtable(files);
614 	__clear_open_fd(fd, fdt);
615 	if (fd < files->next_fd)
616 		files->next_fd = fd;
617 }
618 
619 void put_unused_fd(unsigned int fd)
620 {
621 	struct files_struct *files = current->files;
622 	spin_lock(&files->file_lock);
623 	__put_unused_fd(files, fd);
624 	spin_unlock(&files->file_lock);
625 }
626 
627 EXPORT_SYMBOL(put_unused_fd);
628 
629 /**
630  * fd_install - install a file pointer in the fd array
631  * @fd: file descriptor to install the file in
632  * @file: the file to install
633  *
634  * This consumes the "file" refcount, so callers should treat it
635  * as if they had called fput(file).
636  */
637 void fd_install(unsigned int fd, struct file *file)
638 {
639 	struct files_struct *files = current->files;
640 	struct fdtable *fdt;
641 
642 	if (WARN_ON_ONCE(unlikely(file->f_mode & FMODE_BACKING)))
643 		return;
644 
645 	rcu_read_lock_sched();
646 
647 	if (unlikely(files->resize_in_progress)) {
648 		rcu_read_unlock_sched();
649 		spin_lock(&files->file_lock);
650 		fdt = files_fdtable(files);
651 		VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);
652 		rcu_assign_pointer(fdt->fd[fd], file);
653 		spin_unlock(&files->file_lock);
654 		return;
655 	}
656 	/* coupled with smp_wmb() in expand_fdtable() */
657 	smp_rmb();
658 	fdt = rcu_dereference_sched(files->fdt);
659 	VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);
660 	rcu_assign_pointer(fdt->fd[fd], file);
661 	rcu_read_unlock_sched();
662 }
663 
664 EXPORT_SYMBOL(fd_install);
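
/*
 * Editorial sketch of the reserve-then-install pattern the comment above
 * describes (the helper name is made up).  fd_install() consumes the file
 * reference; if anything fails between the two steps, a real caller would
 * put_unused_fd() the slot and fput() the file instead.
 */
static int example_install(struct file *file)
{
	int fd = get_unused_fd_flags(O_CLOEXEC);	/* reserve a slot */

	if (fd < 0)
		return fd;

	/* ->fd[fd] is guaranteed NULL until this point; see do_dup2() below */
	fd_install(fd, file);
	return fd;
}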
665 
666 /**
667  * file_close_fd_locked - return file associated with fd
668  * @files: file struct to retrieve file from
669  * @fd: file descriptor to retrieve file for
670  *
671  * Doesn't take a separate reference count.
672  *
673  * Context: files_lock must be held.
674  *
675  * Returns: The file associated with @fd (NULL if @fd is not open)
676  */
677 struct file *file_close_fd_locked(struct files_struct *files, unsigned fd)
678 {
679 	struct fdtable *fdt = files_fdtable(files);
680 	struct file *file;
681 
682 	lockdep_assert_held(&files->file_lock);
683 
684 	if (fd >= fdt->max_fds)
685 		return NULL;
686 
687 	fd = array_index_nospec(fd, fdt->max_fds);
688 	file = rcu_dereference_raw(fdt->fd[fd]);
689 	if (file) {
690 		rcu_assign_pointer(fdt->fd[fd], NULL);
691 		__put_unused_fd(files, fd);
692 	}
693 	return file;
694 }
695 
696 int close_fd(unsigned fd)
697 {
698 	struct files_struct *files = current->files;
699 	struct file *file;
700 
701 	spin_lock(&files->file_lock);
702 	file = file_close_fd_locked(files, fd);
703 	spin_unlock(&files->file_lock);
704 	if (!file)
705 		return -EBADF;
706 
707 	return filp_close(file, files);
708 }
709 EXPORT_SYMBOL(close_fd);
710 
711 /**
712  * last_fd - return last valid index into fd table
713  * @fdt: File descriptor table.
714  *
715  * Context: Either rcu read lock or files_lock must be held.
716  *
717  * Returns: Last valid index into fdtable.
718  */
719 static inline unsigned last_fd(struct fdtable *fdt)
720 {
721 	return fdt->max_fds - 1;
722 }
723 
724 static inline void __range_cloexec(struct files_struct *cur_fds,
725 				   unsigned int fd, unsigned int max_fd)
726 {
727 	struct fdtable *fdt;
728 
729 	/* make sure we're using the correct maximum value */
730 	spin_lock(&cur_fds->file_lock);
731 	fdt = files_fdtable(cur_fds);
732 	max_fd = min(last_fd(fdt), max_fd);
733 	if (fd <= max_fd)
734 		bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1);
735 	spin_unlock(&cur_fds->file_lock);
736 }
737 
738 static inline void __range_close(struct files_struct *files, unsigned int fd,
739 				 unsigned int max_fd)
740 {
741 	struct file *file;
742 	unsigned n;
743 
744 	spin_lock(&files->file_lock);
745 	n = last_fd(files_fdtable(files));
746 	max_fd = min(max_fd, n);
747 
748 	for (; fd <= max_fd; fd++) {
749 		file = file_close_fd_locked(files, fd);
750 		if (file) {
751 			spin_unlock(&files->file_lock);
752 			filp_close(file, files);
753 			cond_resched();
754 			spin_lock(&files->file_lock);
755 		} else if (need_resched()) {
756 			spin_unlock(&files->file_lock);
757 			cond_resched();
758 			spin_lock(&files->file_lock);
759 		}
760 	}
761 	spin_unlock(&files->file_lock);
762 }
763 
764 /**
765  * sys_close_range() - Close all file descriptors in a given range.
766  *
767  * @fd:     starting file descriptor to close
768  * @max_fd: last file descriptor to close
769  * @flags:  CLOSE_RANGE flags.
770  *
771  * This closes a range of file descriptors. All file descriptors
772  * from @fd up to and including @max_fd are closed.
773  * Currently, errors to close a given file descriptor are ignored.
774  */
775 SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd,
776 		unsigned int, flags)
777 {
778 	struct task_struct *me = current;
779 	struct files_struct *cur_fds = me->files, *fds = NULL;
780 
781 	if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC))
782 		return -EINVAL;
783 
784 	if (fd > max_fd)
785 		return -EINVAL;
786 
787 	if ((flags & CLOSE_RANGE_UNSHARE) && atomic_read(&cur_fds->count) > 1) {
788 		struct fd_range range = {fd, max_fd}, *punch_hole = &range;
789 
790 		/*
791 		 * If the caller requested all fds to be made cloexec we always
792 		 * copy all of the file descriptors since they still want to
793 		 * use them.
794 		 */
795 		if (flags & CLOSE_RANGE_CLOEXEC)
796 			punch_hole = NULL;
797 
798 		fds = dup_fd(cur_fds, punch_hole);
799 		if (IS_ERR(fds))
800 			return PTR_ERR(fds);
801 		/*
802 		 * We used to share our file descriptor table, and have now
803 		 * created a private one, make sure we're using it below.
804 		 */
805 		swap(cur_fds, fds);
806 	}
807 
808 	if (flags & CLOSE_RANGE_CLOEXEC)
809 		__range_cloexec(cur_fds, fd, max_fd);
810 	else
811 		__range_close(cur_fds, fd, max_fd);
812 
813 	if (fds) {
814 		/*
815 		 * We're done closing the files we were supposed to. Time to install
816 		 * the new file descriptor table and drop the old one.
817 		 */
818 		task_lock(me);
819 		me->files = cur_fds;
820 		task_unlock(me);
821 		put_files_struct(fds);
822 	}
823 
824 	return 0;
825 }
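
/*
 * Userspace usage sketch for the syscall above (editorial; assumes
 * kernel/libc headers recent enough to provide SYS_close_range and
 * CLOSE_RANGE_CLOEXEC): mark every descriptor from 3 upwards close-on-exec
 * before exec'ing a helper, without closing anything in the current process.
 */
#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/close_range.h>

static long cloexec_all_but_stdio(void)
{
	return syscall(SYS_close_range, 3U, ~0U, CLOSE_RANGE_CLOEXEC);
}

int main(void)
{
	return cloexec_all_but_stdio() ? 1 : 0;
}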
826 
827 /**
828  * file_close_fd - return file associated with fd
829  * @fd: file descriptor to retrieve file for
830  *
831  * Doesn't take a separate reference count.
832  *
833  * Returns: The file associated with @fd (NULL if @fd is not open)
834  */
835 struct file *file_close_fd(unsigned int fd)
836 {
837 	struct files_struct *files = current->files;
838 	struct file *file;
839 
840 	spin_lock(&files->file_lock);
841 	file = file_close_fd_locked(files, fd);
842 	spin_unlock(&files->file_lock);
843 
844 	return file;
845 }
846 
847 void do_close_on_exec(struct files_struct *files)
848 {
849 	unsigned i;
850 	struct fdtable *fdt;
851 
852 	/* exec unshares first */
853 	spin_lock(&files->file_lock);
854 	for (i = 0; ; i++) {
855 		unsigned long set;
856 		unsigned fd = i * BITS_PER_LONG;
857 		fdt = files_fdtable(files);
858 		if (fd >= fdt->max_fds)
859 			break;
860 		set = fdt->close_on_exec[i];
861 		if (!set)
862 			continue;
863 		fdt->close_on_exec[i] = 0;
864 		for ( ; set ; fd++, set >>= 1) {
865 			struct file *file;
866 			if (!(set & 1))
867 				continue;
868 			file = fdt->fd[fd];
869 			if (!file)
870 				continue;
871 			rcu_assign_pointer(fdt->fd[fd], NULL);
872 			__put_unused_fd(files, fd);
873 			spin_unlock(&files->file_lock);
874 			filp_close(file, files);
875 			cond_resched();
876 			spin_lock(&files->file_lock);
877 		}
878 
879 	}
880 	spin_unlock(&files->file_lock);
881 }
882 
883 static struct file *__get_file_rcu(struct file __rcu **f)
884 {
885 	struct file __rcu *file;
886 	struct file __rcu *file_reloaded;
887 	struct file __rcu *file_reloaded_cmp;
888 
889 	file = rcu_dereference_raw(*f);
890 	if (!file)
891 		return NULL;
892 
893 	if (unlikely(!file_ref_get(&file->f_ref)))
894 		return ERR_PTR(-EAGAIN);
895 
896 	file_reloaded = rcu_dereference_raw(*f);
897 
898 	/*
899 	 * Ensure that all accesses have a dependency on the load from
900 	 * rcu_dereference_raw() above so we get correct ordering
901 	 * between reuse/allocation and the pointer check below.
902 	 */
903 	file_reloaded_cmp = file_reloaded;
904 	OPTIMIZER_HIDE_VAR(file_reloaded_cmp);
905 
906 	/*
907 	 * file_ref_get() above provided a full memory barrier when we
908 	 * acquired a reference.
909 	 *
910 	 * This is paired with the write barrier from assigning to the
911 	 * __rcu protected file pointer so that if that pointer still
912 	 * matches the current file, we know we have successfully
913 	 * acquired a reference to the right file.
914 	 *
915 	 * If the pointers don't match the file has been reallocated by
916 	 * SLAB_TYPESAFE_BY_RCU.
917 	 */
918 	if (file == file_reloaded_cmp)
919 		return file_reloaded;
920 
921 	fput(file);
922 	return ERR_PTR(-EAGAIN);
923 }
924 
925 /**
926  * get_file_rcu - try to get a reference to a file under rcu
927  * @f: the file to get a reference on
928  *
929  * This function tries to get a reference on @f carefully verifying that
930  * @f hasn't been reused.
931  *
932  * This function should rarely have to be used and only by users who
933  * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
934  *
935  * Return: Returns @f with the reference count increased or NULL.
936  */
937 struct file *get_file_rcu(struct file __rcu **f)
938 {
939 	for (;;) {
940 		struct file __rcu *file;
941 
942 		file = __get_file_rcu(f);
943 		if (!IS_ERR(file))
944 			return file;
945 	}
946 }
947 EXPORT_SYMBOL_GPL(get_file_rcu);
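
/*
 * Hedged kernel-context sketch of a get_file_rcu() user: some object
 * publishes a struct file pointer in an __rcu slot (the holder struct here
 * is hypothetical) and a reader wants a stable reference despite
 * SLAB_TYPESAFE_BY_RCU reuse.  Lifetime of the holder itself is assumed to
 * be handled elsewhere.
 */
struct example_holder {
	struct file __rcu *filp;
};

static struct file *example_grab(struct example_holder *h)
{
	struct file *file;

	rcu_read_lock();
	file = get_file_rcu(&h->filp);	/* NULL if the slot is empty */
	rcu_read_unlock();

	return file;			/* caller must fput() it when done */
}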
948 
949 /**
950  * get_file_active - try to get a reference to a file
951  * @f: the file to get a reference on
952  *
953  * In contrast to get_file_rcu(), the pointer itself isn't part of the
954  * reference counting.
955  *
956  * This function should rarely have to be used and only by users who
957  * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
958  *
959  * Return: Returns @f with the reference count increased or NULL.
960  */
961 struct file *get_file_active(struct file **f)
962 {
963 	struct file __rcu *file;
964 
965 	rcu_read_lock();
966 	file = __get_file_rcu(f);
967 	rcu_read_unlock();
968 	if (IS_ERR(file))
969 		file = NULL;
970 	return file;
971 }
972 EXPORT_SYMBOL_GPL(get_file_active);
973 
974 static inline struct file *__fget_files_rcu(struct files_struct *files,
975        unsigned int fd, fmode_t mask)
976 {
977 	for (;;) {
978 		struct file *file;
979 		struct fdtable *fdt = rcu_dereference_raw(files->fdt);
980 		struct file __rcu **fdentry;
981 		unsigned long nospec_mask;
982 
983 		/* Mask is a 0 for invalid fd's, ~0 for valid ones */
984 		nospec_mask = array_index_mask_nospec(fd, fdt->max_fds);
985 
986 		/*
987 		 * fdentry points to the 'fd' offset, or fdt->fd[0].
988 		 * Loading from fdt->fd[0] is always safe, because the
989 		 * array always exists.
990 		 */
991 		fdentry = fdt->fd + (fd & nospec_mask);
992 
993 		/* Do the load, then mask any invalid result */
994 		file = rcu_dereference_raw(*fdentry);
995 		file = (void *)(nospec_mask & (unsigned long)file);
996 		if (unlikely(!file))
997 			return NULL;
998 
999 		/*
1000 		 * Ok, we have a file pointer that was valid at
1001 		 * some point, but it might have become stale since.
1002 		 *
1003 		 * We need to confirm it by incrementing the refcount
1004 		 * and then check the lookup again.
1005 		 *
1006 		 * file_ref_get() gives us a full memory barrier. We
1007 		 * only really need an 'acquire' one to protect the
1008 		 * loads below, but we don't have that.
1009 		 */
1010 		if (unlikely(!file_ref_get(&file->f_ref)))
1011 			continue;
1012 
1013 		/*
1014 		 * Such a race can take two forms:
1015 		 *
1016 		 *  (a) the file ref already went down to zero and the
1017 		 *      file hasn't been reused yet or the file count
1018 		 *      isn't zero but the file has already been reused.
1019 		 *
1020 		 *  (b) the file table entry has changed under us.
1021 		 *       Note that we don't need to re-check the 'fdt->fd'
1022 		 *       pointer having changed, because it always goes
1023 		 *       hand-in-hand with 'fdt'.
1024 		 *
1025 		 * If so, we need to put our ref and try again.
1026 		 */
1027 		if (unlikely(file != rcu_dereference_raw(*fdentry)) ||
1028 		    unlikely(rcu_dereference_raw(files->fdt) != fdt)) {
1029 			fput(file);
1030 			continue;
1031 		}
1032 
1033 		/*
1034 		 * This isn't the file we're looking for or we're not
1035 		 * allowed to get a reference to it.
1036 		 */
1037 		if (unlikely(file->f_mode & mask)) {
1038 			fput(file);
1039 			return NULL;
1040 		}
1041 
1042 		/*
1043 		 * Ok, we have a ref to the file, and checked that it
1044 		 * still exists.
1045 		 */
1046 		return file;
1047 	}
1048 }
1049 
1050 static struct file *__fget_files(struct files_struct *files, unsigned int fd,
1051 				 fmode_t mask)
1052 {
1053 	struct file *file;
1054 
1055 	rcu_read_lock();
1056 	file = __fget_files_rcu(files, fd, mask);
1057 	rcu_read_unlock();
1058 
1059 	return file;
1060 }
1061 
1062 static inline struct file *__fget(unsigned int fd, fmode_t mask)
1063 {
1064 	return __fget_files(current->files, fd, mask);
1065 }
1066 
1067 struct file *fget(unsigned int fd)
1068 {
1069 	return __fget(fd, FMODE_PATH);
1070 }
1071 EXPORT_SYMBOL(fget);
1072 
1073 struct file *fget_raw(unsigned int fd)
1074 {
1075 	return __fget(fd, 0);
1076 }
1077 EXPORT_SYMBOL(fget_raw);
1078 
1079 struct file *fget_task(struct task_struct *task, unsigned int fd)
1080 {
1081 	struct file *file = NULL;
1082 
1083 	task_lock(task);
1084 	if (task->files)
1085 		file = __fget_files(task->files, fd, 0);
1086 	task_unlock(task);
1087 
1088 	return file;
1089 }
1090 
1091 struct file *fget_task_next(struct task_struct *task, unsigned int *ret_fd)
1092 {
1093 	/* Must be called with rcu_read_lock held */
1094 	struct files_struct *files;
1095 	unsigned int fd = *ret_fd;
1096 	struct file *file = NULL;
1097 
1098 	task_lock(task);
1099 	files = task->files;
1100 	if (files) {
1101 		rcu_read_lock();
1102 		for (; fd < files_fdtable(files)->max_fds; fd++) {
1103 			file = __fget_files_rcu(files, fd, 0);
1104 			if (file)
1105 				break;
1106 		}
1107 		rcu_read_unlock();
1108 	}
1109 	task_unlock(task);
1110 	*ret_fd = fd;
1111 	return file;
1112 }
1113 EXPORT_SYMBOL(fget_task_next);
1114 
1115 /*
1116  * Lightweight file lookup - no refcnt increment if fd table isn't shared.
1117  *
1118  * You can use this instead of fget if you satisfy all of the following
1119  * conditions:
1120  * 1) You must call fput_light before exiting the syscall and returning control
1121  *    to userspace (i.e. you cannot remember the returned struct file * after
1122  *    returning to userspace).
1123  * 2) You must not call filp_close on the returned struct file * in between
1124  *    calls to fget_light and fput_light.
1125  * 3) You must not clone the current task in between the calls to fget_light
1126  *    and fput_light.
1127  *
1128  * The fput_needed flag returned by fget_light should be passed to the
1129  * corresponding fput_light.
1130  *
1131  * (As an exception to rule 2, you can call filp_close between fget_light and
1132  * fput_light provided that you capture a real refcount with get_file before
1133  * the call to filp_close, and ensure that this real refcount is fput *after*
1134  * the fput_light call.)
1135  *
1136  * See also the documentation in rust/kernel/file.rs.
1137  */
1138 static inline struct fd __fget_light(unsigned int fd, fmode_t mask)
1139 {
1140 	struct files_struct *files = current->files;
1141 	struct file *file;
1142 
1143 	/*
1144 	 * If another thread is concurrently calling close_fd() followed
1145 	 * by put_files_struct(), we must not observe the old table
1146 	 * entry combined with the new refcount - otherwise we could
1147 	 * return a file that is concurrently being freed.
1148 	 *
1149 	 * atomic_read_acquire() pairs with atomic_dec_and_test() in
1150 	 * put_files_struct().
1151 	 */
1152 	if (likely(atomic_read_acquire(&files->count) == 1)) {
1153 		file = files_lookup_fd_raw(files, fd);
1154 		if (!file || unlikely(file->f_mode & mask))
1155 			return EMPTY_FD;
1156 		return BORROWED_FD(file);
1157 	} else {
1158 		file = __fget_files(files, fd, mask);
1159 		if (!file)
1160 			return EMPTY_FD;
1161 		return CLONED_FD(file);
1162 	}
1163 }
1164 struct fd fdget(unsigned int fd)
1165 {
1166 	return __fget_light(fd, FMODE_PATH);
1167 }
1168 EXPORT_SYMBOL(fdget);
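
/*
 * Editorial sketch of the fdget()/fdput() pattern used by most syscalls
 * (the body is made up; newer code often uses the CLASS(fd) guard instead).
 * fd_file() is NULL for an invalid descriptor, and fdput() drops a
 * reference only in the CLONED_FD case above.
 */
static long example_syscall(unsigned int fd)
{
	struct fd f = fdget(fd);
	long ret = -EBADF;

	if (fd_file(f)) {
		ret = 0;	/* ... operate on fd_file(f) ... */
		fdput(f);
	}
	return ret;
}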
1169 
1170 struct fd fdget_raw(unsigned int fd)
1171 {
1172 	return __fget_light(fd, 0);
1173 }
1174 
1175 /*
1176  * Try to avoid f_pos locking. We only need it if the
1177  * file is marked for FMODE_ATOMIC_POS, and it can be
1178  * accessed multiple ways.
1179  *
1180  * Always do it for directories, because pidfd_getfd()
1181  * can make a file accessible even if it otherwise would
1182  * not be, and for directories this is a correctness
1183  * issue, not a "POSIX requirement".
1184  */
1185 static inline bool file_needs_f_pos_lock(struct file *file)
1186 {
1187 	if (!(file->f_mode & FMODE_ATOMIC_POS))
1188 		return false;
1189 	if (__file_ref_read_raw(&file->f_ref) != FILE_REF_ONEREF)
1190 		return true;
1191 	if (file->f_op->iterate_shared)
1192 		return true;
1193 	return false;
1194 }
1195 
1196 bool file_seek_cur_needs_f_lock(struct file *file)
1197 {
1198 	if (!(file->f_mode & FMODE_ATOMIC_POS) && !file->f_op->iterate_shared)
1199 		return false;
1200 
1201 	/*
1202 	 * Note that we are not guaranteed to be called after fdget_pos() on
1203 	 * this file obj, in which case the caller is expected to provide the
1204 	 * appropriate locking.
1205 	 */
1206 
1207 	return true;
1208 }
1209 
1210 struct fd fdget_pos(unsigned int fd)
1211 {
1212 	struct fd f = fdget(fd);
1213 	struct file *file = fd_file(f);
1214 
1215 	if (likely(file) && file_needs_f_pos_lock(file)) {
1216 		f.word |= FDPUT_POS_UNLOCK;
1217 		mutex_lock(&file->f_pos_lock);
1218 	}
1219 	return f;
1220 }
1221 
1222 void __f_unlock_pos(struct file *f)
1223 {
1224 	mutex_unlock(&f->f_pos_lock);
1225 }
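
/*
 * Editorial sketch of how read/write-style paths use the f_pos helpers
 * (real users live in fs/read_write.c): fdget_pos() takes f_pos_lock only
 * when file_needs_f_pos_lock() says the position can be raced on, and
 * fdput_pos() releases exactly what was taken.
 */
static long example_positional(unsigned int fd)
{
	struct fd f = fdget_pos(fd);
	long ret = -EBADF;

	if (fd_file(f)) {
		ret = 0;	/* fd_file(f)->f_pos can be used safely here */
		fdput_pos(f);
	}
	return ret;
}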
1226 
1227 /*
1228  * We only lock f_pos if we have threads or if the file might be
1229  * shared with another process. In both cases we'll have an elevated
1230  * file count (done either by fdget() or by fork()).
1231  */
1232 
1233 void set_close_on_exec(unsigned int fd, int flag)
1234 {
1235 	struct files_struct *files = current->files;
1236 	spin_lock(&files->file_lock);
1237 	__set_close_on_exec(fd, files_fdtable(files), flag);
1238 	spin_unlock(&files->file_lock);
1239 }
1240 
1241 bool get_close_on_exec(unsigned int fd)
1242 {
1243 	bool res;
1244 	rcu_read_lock();
1245 	res = close_on_exec(fd, current->files);
1246 	rcu_read_unlock();
1247 	return res;
1248 }
1249 
1250 static int do_dup2(struct files_struct *files,
1251 	struct file *file, unsigned fd, unsigned flags)
1252 __releases(&files->file_lock)
1253 {
1254 	struct file *tofree;
1255 	struct fdtable *fdt;
1256 
1257 	/*
1258 	 * dup2() is expected to close the file installed in the target fd slot
1259 	 * (if any). However, userspace hand-picking a fd may be racing against
1260 	 * its own threads which happened to allocate it in open() et al but did
1261 	 * not populate it yet.
1262 	 *
1263 	 * Broadly speaking we may be racing against the following:
1264 	 * fd = get_unused_fd_flags();     // fd slot reserved, ->fd[fd] == NULL
1265 	 * file = hard_work_goes_here();
1266 	 * fd_install(fd, file);           // only now ->fd[fd] == file
1267 	 *
1268 	 * It is an invariant that a successfully allocated fd has a NULL entry
1269 	 * in the array until the matching fd_install().
1270 	 *
1271 	 * If we fit the window, we have the fd to populate, yet no target file
1272 	 * to close. Trying to ignore it and install our new file would violate
1273 	 * the invariant and make fd_install() overwrite our file.
1274 	 *
1275 	 * Things can be done(tm) to handle this. However, the issue does not
1276 	 * concern legitimate programs and we only need to make sure the kernel
1277 	 * does not trip over it.
1278 	 *
1279 	 * The simplest way out is to return an error if we find ourselves here.
1280 	 *
1281 	 * POSIX is silent on the issue, we return -EBUSY.
1282 	 */
1283 	fdt = files_fdtable(files);
1284 	fd = array_index_nospec(fd, fdt->max_fds);
1285 	tofree = rcu_dereference_raw(fdt->fd[fd]);
1286 	if (!tofree && fd_is_open(fd, fdt))
1287 		goto Ebusy;
1288 	get_file(file);
1289 	rcu_assign_pointer(fdt->fd[fd], file);
1290 	__set_open_fd(fd, fdt, flags & O_CLOEXEC);
1291 	spin_unlock(&files->file_lock);
1292 
1293 	if (tofree)
1294 		filp_close(tofree, files);
1295 
1296 	return fd;
1297 
1298 Ebusy:
1299 	spin_unlock(&files->file_lock);
1300 	return -EBUSY;
1301 }
1302 
1303 int replace_fd(unsigned fd, struct file *file, unsigned flags)
1304 {
1305 	int err;
1306 	struct files_struct *files = current->files;
1307 
1308 	if (!file)
1309 		return close_fd(fd);
1310 
1311 	if (fd >= rlimit(RLIMIT_NOFILE))
1312 		return -EBADF;
1313 
1314 	spin_lock(&files->file_lock);
1315 	err = expand_files(files, fd);
1316 	if (unlikely(err < 0))
1317 		goto out_unlock;
1318 	return do_dup2(files, file, fd, flags);
1319 
1320 out_unlock:
1321 	spin_unlock(&files->file_lock);
1322 	return err;
1323 }
1324 
1325 /**
1326  * receive_fd() - Install received file into file descriptor table
1327  * @file: struct file that was received from another process
1328  * @ufd: __user pointer to write new fd number to
1329  * @o_flags: the O_* flags to apply to the new fd entry
1330  *
1331  * Installs a received file into the file descriptor table, with appropriate
1332  * checks and count updates. Optionally writes the fd number to userspace, if
1333  * @ufd is non-NULL.
1334  *
1335  * This helper handles its own reference counting of the incoming
1336  * struct file.
1337  *
1338  * Returns the newly installed fd or a negative error code on failure.
1339  */
1340 int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
1341 {
1342 	int new_fd;
1343 	int error;
1344 
1345 	error = security_file_receive(file);
1346 	if (error)
1347 		return error;
1348 
1349 	new_fd = get_unused_fd_flags(o_flags);
1350 	if (new_fd < 0)
1351 		return new_fd;
1352 
1353 	if (ufd) {
1354 		error = put_user(new_fd, ufd);
1355 		if (error) {
1356 			put_unused_fd(new_fd);
1357 			return error;
1358 		}
1359 	}
1360 
1361 	fd_install(new_fd, get_file(file));
1362 	__receive_sock(file);
1363 	return new_fd;
1364 }
1365 EXPORT_SYMBOL_GPL(receive_fd);
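
/*
 * Hedged sketch of a receive_fd() caller in the style of SCM_RIGHTS or
 * seccomp addfd handling ('uarg' is a hypothetical __user pointer).
 * receive_fd() takes its own reference on @file, so the caller's reference
 * is unaffected either way.
 */
static int example_receive(struct file *file, int __user *uarg)
{
	/* returns the new fd and, if uarg is non-NULL, writes it there too */
	return receive_fd(file, uarg, O_CLOEXEC);
}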
1366 
1367 int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
1368 {
1369 	int error;
1370 
1371 	error = security_file_receive(file);
1372 	if (error)
1373 		return error;
1374 	error = replace_fd(new_fd, file, o_flags);
1375 	if (error)
1376 		return error;
1377 	__receive_sock(file);
1378 	return new_fd;
1379 }
1380 
1381 static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
1382 {
1383 	int err = -EBADF;
1384 	struct file *file;
1385 	struct files_struct *files = current->files;
1386 
1387 	if ((flags & ~O_CLOEXEC) != 0)
1388 		return -EINVAL;
1389 
1390 	if (unlikely(oldfd == newfd))
1391 		return -EINVAL;
1392 
1393 	if (newfd >= rlimit(RLIMIT_NOFILE))
1394 		return -EBADF;
1395 
1396 	spin_lock(&files->file_lock);
1397 	err = expand_files(files, newfd);
1398 	file = files_lookup_fd_locked(files, oldfd);
1399 	if (unlikely(!file))
1400 		goto Ebadf;
1401 	if (unlikely(err < 0)) {
1402 		if (err == -EMFILE)
1403 			goto Ebadf;
1404 		goto out_unlock;
1405 	}
1406 	return do_dup2(files, file, newfd, flags);
1407 
1408 Ebadf:
1409 	err = -EBADF;
1410 out_unlock:
1411 	spin_unlock(&files->file_lock);
1412 	return err;
1413 }
1414 
1415 SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
1416 {
1417 	return ksys_dup3(oldfd, newfd, flags);
1418 }
1419 
1420 SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
1421 {
1422 	if (unlikely(newfd == oldfd)) { /* corner case */
1423 		struct files_struct *files = current->files;
1424 		struct file *f;
1425 		int retval = oldfd;
1426 
1427 		rcu_read_lock();
1428 		f = __fget_files_rcu(files, oldfd, 0);
1429 		if (!f)
1430 			retval = -EBADF;
1431 		rcu_read_unlock();
1432 		if (f)
1433 			fput(f);
1434 		return retval;
1435 	}
1436 	return ksys_dup3(oldfd, newfd, 0);
1437 }
1438 
1439 SYSCALL_DEFINE1(dup, unsigned int, fildes)
1440 {
1441 	int ret = -EBADF;
1442 	struct file *file = fget_raw(fildes);
1443 
1444 	if (file) {
1445 		ret = get_unused_fd_flags(0);
1446 		if (ret >= 0)
1447 			fd_install(ret, file);
1448 		else
1449 			fput(file);
1450 	}
1451 	return ret;
1452 }
1453 
1454 int f_dupfd(unsigned int from, struct file *file, unsigned flags)
1455 {
1456 	unsigned long nofile = rlimit(RLIMIT_NOFILE);
1457 	int err;
1458 	if (from >= nofile)
1459 		return -EINVAL;
1460 	err = alloc_fd(from, nofile, flags);
1461 	if (err >= 0) {
1462 		get_file(file);
1463 		fd_install(err, file);
1464 	}
1465 	return err;
1466 }
1467 
1468 int iterate_fd(struct files_struct *files, unsigned n,
1469 		int (*f)(const void *, struct file *, unsigned),
1470 		const void *p)
1471 {
1472 	struct fdtable *fdt;
1473 	int res = 0;
1474 	if (!files)
1475 		return 0;
1476 	spin_lock(&files->file_lock);
1477 	for (fdt = files_fdtable(files); n < fdt->max_fds; n++) {
1478 		struct file *file;
1479 		file = rcu_dereference_check_fdtable(files, fdt->fd[n]);
1480 		if (!file)
1481 			continue;
1482 		res = f(p, file, n);
1483 		if (res)
1484 			break;
1485 	}
1486 	spin_unlock(&files->file_lock);
1487 	return res;
1488 }
1489 EXPORT_SYMBOL(iterate_fd);
1490