xref: /linux/io_uring/rsrc.c (revision c7546e2c3cb739a3c1a2f5acaf9bb629d401afe5)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/kernel.h>
3 #include <linux/errno.h>
4 #include <linux/fs.h>
5 #include <linux/file.h>
6 #include <linux/mm.h>
7 #include <linux/slab.h>
8 #include <linux/nospec.h>
9 #include <linux/hugetlb.h>
10 #include <linux/compat.h>
11 #include <linux/io_uring.h>
12 
13 #include <uapi/linux/io_uring.h>
14 
15 #include "io_uring.h"
16 #include "alloc_cache.h"
17 #include "openclose.h"
18 #include "rsrc.h"
19 #include "memmap.h"
20 #include "register.h"
21 
/*
 * Per-request command data for a files-update request. It is filled in by
 * io_files_update_prep() from the SQE and consumed by io_files_update().
 */
struct io_rsrc_update {
	struct file			*file;
	/* user address of the fd array, read from sqe->addr */
	u64				arg;
	/* number of entries to process, read from sqe->len (never 0) */
	u32				nr_args;
	/* first slot to update, or IORING_FILE_INDEX_ALLOC; read from sqe->off */
	u32				offset;
};
28 
/* Defined later in this file, but used by code that comes before them. */
static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage);

/*
 * only define max
 *
 * IORING_MAX_FIXED_FILES caps nr_args in io_sqe_files_register() and
 * IORING_MAX_REG_BUFFERS caps nr_args in io_sqe_buffers_register().
 */
#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)
37 
/*
 * Placeholder for buffer table slots that hold no real buffer:
 * io_sqe_buffer_register() hands it out when iov_base is NULL. It is
 * recognised by address; io_buffer_unmap() skips it entirely.
 */
static const struct io_mapped_ubuf dummy_ubuf = {
	/* set invalid range, so io_import_fixed() fails meeting it */
	.ubuf = -1UL,
	.len = UINT_MAX,
};
43 
44 int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
45 {
46 	unsigned long page_limit, cur_pages, new_pages;
47 
48 	if (!nr_pages)
49 		return 0;
50 
51 	/* Don't allow more pages than we can safely lock */
52 	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
53 
54 	cur_pages = atomic_long_read(&user->locked_vm);
55 	do {
56 		new_pages = cur_pages + nr_pages;
57 		if (new_pages > page_limit)
58 			return -ENOMEM;
59 	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
60 					  &cur_pages, new_pages));
61 	return 0;
62 }
63 
64 static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
65 {
66 	if (ctx->user)
67 		__io_unaccount_mem(ctx->user, nr_pages);
68 
69 	if (ctx->mm_account)
70 		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
71 }
72 
73 static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
74 {
75 	int ret;
76 
77 	if (ctx->user) {
78 		ret = __io_account_mem(ctx->user, nr_pages);
79 		if (ret)
80 			return ret;
81 	}
82 
83 	if (ctx->mm_account)
84 		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
85 
86 	return 0;
87 }
88 
89 static int io_buffer_validate(struct iovec *iov)
90 {
91 	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
92 
93 	/*
94 	 * Don't impose further limits on the size and buffer
95 	 * constraints here, we'll -EINVAL later when IO is
96 	 * submitted if they are wrong.
97 	 */
98 	if (!iov->iov_base)
99 		return iov->iov_len ? -EFAULT : 0;
100 	if (!iov->iov_len)
101 		return -EFAULT;
102 
103 	/* arbitrary limit, but we need something */
104 	if (iov->iov_len > SZ_1G)
105 		return -EFAULT;
106 
107 	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
108 		return -EOVERFLOW;
109 
110 	return 0;
111 }
112 
113 static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
114 {
115 	struct io_mapped_ubuf *imu = *slot;
116 	unsigned int i;
117 
118 	*slot = NULL;
119 	if (imu != &dummy_ubuf) {
120 		if (!refcount_dec_and_test(&imu->refs))
121 			return;
122 		for (i = 0; i < imu->nr_bvecs; i++)
123 			unpin_user_page(imu->bvec[i].bv_page);
124 		if (imu->acct_pages)
125 			io_unaccount_mem(ctx, imu->acct_pages);
126 		kvfree(imu);
127 	}
128 }
129 
130 static void io_rsrc_put_work(struct io_rsrc_node *node)
131 {
132 	struct io_rsrc_put *prsrc = &node->item;
133 
134 	if (prsrc->tag)
135 		io_post_aux_cqe(node->ctx, prsrc->tag, 0, 0);
136 
137 	switch (node->type) {
138 	case IORING_RSRC_FILE:
139 		fput(prsrc->file);
140 		break;
141 	case IORING_RSRC_BUFFER:
142 		io_rsrc_buf_put(node->ctx, prsrc);
143 		break;
144 	default:
145 		WARN_ON_ONCE(1);
146 		break;
147 	}
148 }
149 
150 void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
151 {
152 	if (!io_alloc_cache_put(&ctx->rsrc_node_cache, node))
153 		kfree(node);
154 }
155 
156 void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
157 	__must_hold(&node->ctx->uring_lock)
158 {
159 	struct io_ring_ctx *ctx = node->ctx;
160 
161 	while (!list_empty(&ctx->rsrc_ref_list)) {
162 		node = list_first_entry(&ctx->rsrc_ref_list,
163 					    struct io_rsrc_node, node);
164 		/* recycle ref nodes in order */
165 		if (node->refs)
166 			break;
167 		list_del(&node->node);
168 
169 		if (likely(!node->empty))
170 			io_rsrc_put_work(node);
171 		io_rsrc_node_destroy(ctx, node);
172 	}
173 	if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce))
174 		wake_up_all(&ctx->rsrc_quiesce_wq);
175 }
176 
177 struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
178 {
179 	struct io_rsrc_node *ref_node;
180 
181 	ref_node = io_alloc_cache_get(&ctx->rsrc_node_cache);
182 	if (!ref_node) {
183 		ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
184 		if (!ref_node)
185 			return NULL;
186 	}
187 
188 	ref_node->ctx = ctx;
189 	ref_node->empty = 0;
190 	ref_node->refs = 1;
191 	return ref_node;
192 }
193 
/*
 * Wait until every node on ctx->rsrc_ref_list has been released.
 *
 * The current rsrc node is retired as an "empty" node and replaced by a
 * freshly allocated one, then the task sleeps interruptibly until the list
 * drains. ->uring_lock is dropped while waiting and re-taken before
 * returning.
 *
 * Returns 0 once the list is empty, -ENXIO if @data is already being
 * quiesced, -ENOMEM if the replacement node cannot be allocated, or the
 * negative value from io_run_task_work_sig() if the wait was cut short
 * while nodes were still pending.
 */
__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
				      struct io_ring_ctx *ctx)
{
	struct io_rsrc_node *backup;
	DEFINE_WAIT(we);
	int ret;

	/* As we may drop ->uring_lock, another task may have started quiesce */
	if (data->quiesce)
		return -ENXIO;

	backup = io_rsrc_node_alloc(ctx);
	if (!backup)
		return -ENOMEM;
	/* retire the current node as an empty marker and install the backup */
	ctx->rsrc_node->empty = true;
	ctx->rsrc_node->type = -1;
	list_add_tail(&ctx->rsrc_node->node, &ctx->rsrc_ref_list);
	io_put_rsrc_node(ctx, ctx->rsrc_node);
	ctx->rsrc_node = backup;

	/* dropping our reference already drained the list, nothing to wait for */
	if (list_empty(&ctx->rsrc_ref_list))
		return 0;

	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
		atomic_set(&ctx->cq_wait_nr, 1);
		smp_mb();
	}

	ctx->rsrc_quiesce++;
	data->quiesce = true;
	do {
		prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE);
		mutex_unlock(&ctx->uring_lock);

		ret = io_run_task_work_sig(ctx);
		if (ret < 0) {
			finish_wait(&ctx->rsrc_quiesce_wq, &we);
			mutex_lock(&ctx->uring_lock);
			/* the list may have drained meanwhile, that is still success */
			if (list_empty(&ctx->rsrc_ref_list))
				ret = 0;
			break;
		}

		schedule();
		mutex_lock(&ctx->uring_lock);
		ret = 0;
	} while (!list_empty(&ctx->rsrc_ref_list));

	finish_wait(&ctx->rsrc_quiesce_wq, &we);
	data->quiesce = false;
	ctx->rsrc_quiesce--;

	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
		atomic_set(&ctx->cq_wait_nr, 0);
		smp_mb();
	}
	return ret;
}
252 
253 static void io_free_page_table(void **table, size_t size)
254 {
255 	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
256 
257 	for (i = 0; i < nr_tables; i++)
258 		kfree(table[i]);
259 	kfree(table);
260 }
261 
262 static void io_rsrc_data_free(struct io_rsrc_data *data)
263 {
264 	size_t size = data->nr * sizeof(data->tags[0][0]);
265 
266 	if (data->tags)
267 		io_free_page_table((void **)data->tags, size);
268 	kfree(data);
269 }
270 
271 static __cold void **io_alloc_page_table(size_t size)
272 {
273 	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
274 	size_t init_size = size;
275 	void **table;
276 
277 	table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
278 	if (!table)
279 		return NULL;
280 
281 	for (i = 0; i < nr_tables; i++) {
282 		unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
283 
284 		table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
285 		if (!table[i]) {
286 			io_free_page_table(table, init_size);
287 			return NULL;
288 		}
289 		size -= this_size;
290 	}
291 	return table;
292 }
293 
294 __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, int type,
295 				     u64 __user *utags,
296 				     unsigned nr, struct io_rsrc_data **pdata)
297 {
298 	struct io_rsrc_data *data;
299 	int ret = 0;
300 	unsigned i;
301 
302 	data = kzalloc(sizeof(*data), GFP_KERNEL);
303 	if (!data)
304 		return -ENOMEM;
305 	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
306 	if (!data->tags) {
307 		kfree(data);
308 		return -ENOMEM;
309 	}
310 
311 	data->nr = nr;
312 	data->ctx = ctx;
313 	data->rsrc_type = type;
314 	if (utags) {
315 		ret = -EFAULT;
316 		for (i = 0; i < nr; i++) {
317 			u64 *tag_slot = io_get_tag_slot(data, i);
318 
319 			if (copy_from_user(tag_slot, &utags[i],
320 					   sizeof(*tag_slot)))
321 				goto fail;
322 		}
323 	}
324 	*pdata = data;
325 	return 0;
326 fail:
327 	io_rsrc_data_free(data);
328 	return ret;
329 }
330 
331 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
332 				 struct io_uring_rsrc_update2 *up,
333 				 unsigned nr_args)
334 {
335 	u64 __user *tags = u64_to_user_ptr(up->tags);
336 	__s32 __user *fds = u64_to_user_ptr(up->data);
337 	struct io_rsrc_data *data = ctx->file_data;
338 	struct io_fixed_file *file_slot;
339 	int fd, i, err = 0;
340 	unsigned int done;
341 
342 	if (!ctx->file_data)
343 		return -ENXIO;
344 	if (up->offset + nr_args > ctx->nr_user_files)
345 		return -EINVAL;
346 
347 	for (done = 0; done < nr_args; done++) {
348 		u64 tag = 0;
349 
350 		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
351 		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
352 			err = -EFAULT;
353 			break;
354 		}
355 		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
356 			err = -EINVAL;
357 			break;
358 		}
359 		if (fd == IORING_REGISTER_FILES_SKIP)
360 			continue;
361 
362 		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
363 		file_slot = io_fixed_file_slot(&ctx->file_table, i);
364 
365 		if (file_slot->file_ptr) {
366 			err = io_queue_rsrc_removal(data, i,
367 						    io_slot_file(file_slot));
368 			if (err)
369 				break;
370 			file_slot->file_ptr = 0;
371 			io_file_bitmap_clear(&ctx->file_table, i);
372 		}
373 		if (fd != -1) {
374 			struct file *file = fget(fd);
375 
376 			if (!file) {
377 				err = -EBADF;
378 				break;
379 			}
380 			/*
381 			 * Don't allow io_uring instances to be registered.
382 			 */
383 			if (io_is_uring_fops(file)) {
384 				fput(file);
385 				err = -EBADF;
386 				break;
387 			}
388 			*io_get_tag_slot(data, i) = tag;
389 			io_fixed_file_set(file_slot, file);
390 			io_file_bitmap_set(&ctx->file_table, i);
391 		}
392 	}
393 	return done ? done : err;
394 }
395 
396 static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
397 				   struct io_uring_rsrc_update2 *up,
398 				   unsigned int nr_args)
399 {
400 	u64 __user *tags = u64_to_user_ptr(up->tags);
401 	struct iovec fast_iov, *iov;
402 	struct page *last_hpage = NULL;
403 	struct iovec __user *uvec;
404 	u64 user_data = up->data;
405 	__u32 done;
406 	int i, err;
407 
408 	if (!ctx->buf_data)
409 		return -ENXIO;
410 	if (up->offset + nr_args > ctx->nr_user_bufs)
411 		return -EINVAL;
412 
413 	for (done = 0; done < nr_args; done++) {
414 		struct io_mapped_ubuf *imu;
415 		u64 tag = 0;
416 
417 		uvec = u64_to_user_ptr(user_data);
418 		iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
419 		if (IS_ERR(iov)) {
420 			err = PTR_ERR(iov);
421 			break;
422 		}
423 		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
424 			err = -EFAULT;
425 			break;
426 		}
427 		err = io_buffer_validate(iov);
428 		if (err)
429 			break;
430 		if (!iov->iov_base && tag) {
431 			err = -EINVAL;
432 			break;
433 		}
434 		err = io_sqe_buffer_register(ctx, iov, &imu, &last_hpage);
435 		if (err)
436 			break;
437 
438 		i = array_index_nospec(up->offset + done, ctx->nr_user_bufs);
439 		if (ctx->user_bufs[i] != &dummy_ubuf) {
440 			err = io_queue_rsrc_removal(ctx->buf_data, i,
441 						    ctx->user_bufs[i]);
442 			if (unlikely(err)) {
443 				io_buffer_unmap(ctx, &imu);
444 				break;
445 			}
446 			ctx->user_bufs[i] = (struct io_mapped_ubuf *)&dummy_ubuf;
447 		}
448 
449 		ctx->user_bufs[i] = imu;
450 		*io_get_tag_slot(ctx->buf_data, i) = tag;
451 		if (ctx->compat)
452 			user_data += sizeof(struct compat_iovec);
453 		else
454 			user_data += sizeof(struct iovec);
455 	}
456 	return done ? done : err;
457 }
458 
459 static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
460 				     struct io_uring_rsrc_update2 *up,
461 				     unsigned nr_args)
462 {
463 	__u32 tmp;
464 
465 	lockdep_assert_held(&ctx->uring_lock);
466 
467 	if (check_add_overflow(up->offset, nr_args, &tmp))
468 		return -EOVERFLOW;
469 
470 	switch (type) {
471 	case IORING_RSRC_FILE:
472 		return __io_sqe_files_update(ctx, up, nr_args);
473 	case IORING_RSRC_BUFFER:
474 		return __io_sqe_buffers_update(ctx, up, nr_args);
475 	}
476 	return -EINVAL;
477 }
478 
479 int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
480 			     unsigned nr_args)
481 {
482 	struct io_uring_rsrc_update2 up;
483 
484 	if (!nr_args)
485 		return -EINVAL;
486 	memset(&up, 0, sizeof(up));
487 	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
488 		return -EFAULT;
489 	if (up.resv || up.resv2)
490 		return -EINVAL;
491 	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
492 }
493 
494 int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
495 			    unsigned size, unsigned type)
496 {
497 	struct io_uring_rsrc_update2 up;
498 
499 	if (size != sizeof(up))
500 		return -EINVAL;
501 	if (copy_from_user(&up, arg, sizeof(up)))
502 		return -EFAULT;
503 	if (!up.nr || up.resv || up.resv2)
504 		return -EINVAL;
505 	return __io_register_rsrc_update(ctx, type, &up, up.nr);
506 }
507 
508 __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
509 			    unsigned int size, unsigned int type)
510 {
511 	struct io_uring_rsrc_register rr;
512 
513 	/* keep it extendible */
514 	if (size != sizeof(rr))
515 		return -EINVAL;
516 
517 	memset(&rr, 0, sizeof(rr));
518 	if (copy_from_user(&rr, arg, size))
519 		return -EFAULT;
520 	if (!rr.nr || rr.resv2)
521 		return -EINVAL;
522 	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
523 		return -EINVAL;
524 
525 	switch (type) {
526 	case IORING_RSRC_FILE:
527 		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
528 			break;
529 		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
530 					     rr.nr, u64_to_user_ptr(rr.tags));
531 	case IORING_RSRC_BUFFER:
532 		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
533 			break;
534 		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
535 					       rr.nr, u64_to_user_ptr(rr.tags));
536 	}
537 	return -EINVAL;
538 }
539 
540 int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
541 {
542 	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
543 
544 	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
545 		return -EINVAL;
546 	if (sqe->rw_flags || sqe->splice_fd_in)
547 		return -EINVAL;
548 
549 	up->offset = READ_ONCE(sqe->off);
550 	up->nr_args = READ_ONCE(sqe->len);
551 	if (!up->nr_args)
552 		return -EINVAL;
553 	up->arg = READ_ONCE(sqe->addr);
554 	return 0;
555 }
556 
557 static int io_files_update_with_index_alloc(struct io_kiocb *req,
558 					    unsigned int issue_flags)
559 {
560 	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
561 	__s32 __user *fds = u64_to_user_ptr(up->arg);
562 	unsigned int done;
563 	struct file *file;
564 	int ret, fd;
565 
566 	if (!req->ctx->file_data)
567 		return -ENXIO;
568 
569 	for (done = 0; done < up->nr_args; done++) {
570 		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
571 			ret = -EFAULT;
572 			break;
573 		}
574 
575 		file = fget(fd);
576 		if (!file) {
577 			ret = -EBADF;
578 			break;
579 		}
580 		ret = io_fixed_fd_install(req, issue_flags, file,
581 					  IORING_FILE_INDEX_ALLOC);
582 		if (ret < 0)
583 			break;
584 		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
585 			__io_close_fixed(req->ctx, issue_flags, ret);
586 			ret = -EFAULT;
587 			break;
588 		}
589 	}
590 
591 	if (done)
592 		return done;
593 	return ret;
594 }
595 
596 int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
597 {
598 	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
599 	struct io_ring_ctx *ctx = req->ctx;
600 	struct io_uring_rsrc_update2 up2;
601 	int ret;
602 
603 	up2.offset = up->offset;
604 	up2.data = up->arg;
605 	up2.nr = 0;
606 	up2.tags = 0;
607 	up2.resv = 0;
608 	up2.resv2 = 0;
609 
610 	if (up->offset == IORING_FILE_INDEX_ALLOC) {
611 		ret = io_files_update_with_index_alloc(req, issue_flags);
612 	} else {
613 		io_ring_submit_lock(ctx, issue_flags);
614 		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
615 						&up2, up->nr_args);
616 		io_ring_submit_unlock(ctx, issue_flags);
617 	}
618 
619 	if (ret < 0)
620 		req_set_fail(req);
621 	io_req_set_res(req, ret, 0);
622 	return IOU_OK;
623 }
624 
625 int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc)
626 {
627 	struct io_ring_ctx *ctx = data->ctx;
628 	struct io_rsrc_node *node = ctx->rsrc_node;
629 	u64 *tag_slot = io_get_tag_slot(data, idx);
630 
631 	ctx->rsrc_node = io_rsrc_node_alloc(ctx);
632 	if (unlikely(!ctx->rsrc_node)) {
633 		ctx->rsrc_node = node;
634 		return -ENOMEM;
635 	}
636 
637 	node->item.rsrc = rsrc;
638 	node->type = data->rsrc_type;
639 	node->item.tag = *tag_slot;
640 	*tag_slot = 0;
641 	list_add_tail(&node->node, &ctx->rsrc_ref_list);
642 	io_put_rsrc_node(ctx, node);
643 	return 0;
644 }
645 
646 void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
647 {
648 	int i;
649 
650 	for (i = 0; i < ctx->nr_user_files; i++) {
651 		struct file *file = io_file_from_index(&ctx->file_table, i);
652 
653 		if (!file)
654 			continue;
655 		io_file_bitmap_clear(&ctx->file_table, i);
656 		fput(file);
657 	}
658 
659 	io_free_file_tables(&ctx->file_table);
660 	io_file_table_set_alloc_range(ctx, 0, 0);
661 	io_rsrc_data_free(ctx->file_data);
662 	ctx->file_data = NULL;
663 	ctx->nr_user_files = 0;
664 }
665 
666 int io_sqe_files_unregister(struct io_ring_ctx *ctx)
667 {
668 	unsigned nr = ctx->nr_user_files;
669 	int ret;
670 
671 	if (!ctx->file_data)
672 		return -ENXIO;
673 
674 	/*
675 	 * Quiesce may unlock ->uring_lock, and while it's not held
676 	 * prevent new requests using the table.
677 	 */
678 	ctx->nr_user_files = 0;
679 	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
680 	ctx->nr_user_files = nr;
681 	if (!ret)
682 		__io_sqe_files_unregister(ctx);
683 	return ret;
684 }
685 
686 int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
687 			  unsigned nr_args, u64 __user *tags)
688 {
689 	__s32 __user *fds = (__s32 __user *) arg;
690 	struct file *file;
691 	int fd, ret;
692 	unsigned i;
693 
694 	if (ctx->file_data)
695 		return -EBUSY;
696 	if (!nr_args)
697 		return -EINVAL;
698 	if (nr_args > IORING_MAX_FIXED_FILES)
699 		return -EMFILE;
700 	if (nr_args > rlimit(RLIMIT_NOFILE))
701 		return -EMFILE;
702 	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_FILE, tags, nr_args,
703 				 &ctx->file_data);
704 	if (ret)
705 		return ret;
706 
707 	if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
708 		io_rsrc_data_free(ctx->file_data);
709 		ctx->file_data = NULL;
710 		return -ENOMEM;
711 	}
712 
713 	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
714 		struct io_fixed_file *file_slot;
715 
716 		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
717 			ret = -EFAULT;
718 			goto fail;
719 		}
720 		/* allow sparse sets */
721 		if (!fds || fd == -1) {
722 			ret = -EINVAL;
723 			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
724 				goto fail;
725 			continue;
726 		}
727 
728 		file = fget(fd);
729 		ret = -EBADF;
730 		if (unlikely(!file))
731 			goto fail;
732 
733 		/*
734 		 * Don't allow io_uring instances to be registered.
735 		 */
736 		if (io_is_uring_fops(file)) {
737 			fput(file);
738 			goto fail;
739 		}
740 		file_slot = io_fixed_file_slot(&ctx->file_table, i);
741 		io_fixed_file_set(file_slot, file);
742 		io_file_bitmap_set(&ctx->file_table, i);
743 	}
744 
745 	/* default it to the whole table */
746 	io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files);
747 	return 0;
748 fail:
749 	__io_sqe_files_unregister(ctx);
750 	return ret;
751 }
752 
753 static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
754 {
755 	io_buffer_unmap(ctx, &prsrc->buf);
756 	prsrc->buf = NULL;
757 }
758 
759 void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
760 {
761 	unsigned int i;
762 
763 	for (i = 0; i < ctx->nr_user_bufs; i++)
764 		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
765 	kfree(ctx->user_bufs);
766 	io_rsrc_data_free(ctx->buf_data);
767 	ctx->user_bufs = NULL;
768 	ctx->buf_data = NULL;
769 	ctx->nr_user_bufs = 0;
770 }
771 
772 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
773 {
774 	unsigned nr = ctx->nr_user_bufs;
775 	int ret;
776 
777 	if (!ctx->buf_data)
778 		return -ENXIO;
779 
780 	/*
781 	 * Quiesce may unlock ->uring_lock, and while it's not held
782 	 * prevent new requests using the table.
783 	 */
784 	ctx->nr_user_bufs = 0;
785 	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
786 	ctx->nr_user_bufs = nr;
787 	if (!ret)
788 		__io_sqe_buffers_unregister(ctx);
789 	return ret;
790 }
791 
792 /*
793  * Not super efficient, but this is just a registration time. And we do cache
794  * the last compound head, so generally we'll only do a full search if we don't
795  * match that one.
796  *
797  * We check if the given compound head page has already been accounted, to
798  * avoid double accounting it. This allows us to account the full size of the
799  * page, not just the constituent pages of a huge page.
800  */
801 static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
802 				  int nr_pages, struct page *hpage)
803 {
804 	int i, j;
805 
806 	/* check current page array */
807 	for (i = 0; i < nr_pages; i++) {
808 		if (!PageCompound(pages[i]))
809 			continue;
810 		if (compound_head(pages[i]) == hpage)
811 			return true;
812 	}
813 
814 	/* check previously registered pages */
815 	for (i = 0; i < ctx->nr_user_bufs; i++) {
816 		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
817 
818 		for (j = 0; j < imu->nr_bvecs; j++) {
819 			if (!PageCompound(imu->bvec[j].bv_page))
820 				continue;
821 			if (compound_head(imu->bvec[j].bv_page) == hpage)
822 				return true;
823 		}
824 	}
825 
826 	return false;
827 }
828 
829 static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
830 				 int nr_pages, struct io_mapped_ubuf *imu,
831 				 struct page **last_hpage)
832 {
833 	int i, ret;
834 
835 	imu->acct_pages = 0;
836 	for (i = 0; i < nr_pages; i++) {
837 		if (!PageCompound(pages[i])) {
838 			imu->acct_pages++;
839 		} else {
840 			struct page *hpage;
841 
842 			hpage = compound_head(pages[i]);
843 			if (hpage == *last_hpage)
844 				continue;
845 			*last_hpage = hpage;
846 			if (headpage_already_acct(ctx, pages, i, hpage))
847 				continue;
848 			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
849 		}
850 	}
851 
852 	if (!imu->acct_pages)
853 		return 0;
854 
855 	ret = io_account_mem(ctx, imu->acct_pages);
856 	if (ret)
857 		imu->acct_pages = 0;
858 	return ret;
859 }
860 
/*
 * Replace the per-page array in *@pages by an array with one entry per
 * folio (@nr_folios entries) and update *@nr_pages to match. For each
 * folio, all pins but one are dropped. The old array is freed.
 *
 * @data supplies the page count of the first folio (nr_pages_head) and of
 * the following folios (nr_pages_mid).
 *
 * Returns false if the new array cannot be allocated, in which case
 * *@pages and *@nr_pages are left untouched; true otherwise.
 */
static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages,
				struct io_imu_folio_data *data, int nr_folios)
{
	struct page **page_array = *pages, **new_array = NULL;
	int nr_pages_left = *nr_pages, i, j;

	/* Store head pages only */
	new_array = kvmalloc_array(nr_folios, sizeof(struct page *),
					GFP_KERNEL);
	if (!new_array)
		return false;

	new_array[0] = compound_head(page_array[0]);
	/*
	 * The pages are bound to the folio, it doesn't
	 * actually unpin them but drops all but one reference,
	 * which is usually put down by io_buffer_unmap().
	 * Note, needs a better helper.
	 */
	if (data->nr_pages_head > 1)
		unpin_user_pages(&page_array[1], data->nr_pages_head - 1);

	/* j indexes the first page of the folio handled in each iteration */
	j = data->nr_pages_head;
	nr_pages_left -= data->nr_pages_head;
	for (i = 1; i < nr_folios; i++) {
		unsigned int nr_unpin;

		new_array[i] = page_array[j];
		/* the last folio may be only partially covered */
		nr_unpin = min_t(unsigned int, nr_pages_left - 1,
					data->nr_pages_mid - 1);
		if (nr_unpin)
			unpin_user_pages(&page_array[j+1], nr_unpin);
		j += data->nr_pages_mid;
		nr_pages_left -= data->nr_pages_mid;
	}
	kvfree(page_array);
	*pages = new_array;
	*nr_pages = nr_folios;
	return true;
}
901 
/*
 * Try to shrink the pinned page array to one entry per folio.
 *
 * This only happens if the buffer spans more than one page, the first
 * folio is larger than a single page, pages are contiguous within each
 * folio, the first folio is used up to its last page, every following
 * folio starts at its first page and has the same size, and every folio
 * between the first and the last is fully covered.
 *
 * @data is filled with the folio geometry (folio_shift, nr_pages_head,
 * nr_pages_mid) as a side effect, possibly only partially when false is
 * returned early.
 *
 * Returns true if *@pages / *@nr_pages were rewritten by
 * io_do_coalesce_buffer(), false if the array was left as it is.
 */
static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
					 struct io_imu_folio_data *data)
{
	struct page **page_array = *pages;
	struct folio *folio = page_folio(page_array[0]);
	unsigned int count = 1, nr_folios = 1;
	int i;

	if (*nr_pages <= 1)
		return false;

	data->nr_pages_mid = folio_nr_pages(folio);
	if (data->nr_pages_mid == 1)
		return false;

	data->folio_shift = folio_shift(folio);
	/*
	 * Check if pages are contiguous inside a folio, and all folios have
	 * the same page count except for the head and tail.
	 */
	for (i = 1; i < *nr_pages; i++) {
		if (page_folio(page_array[i]) == folio &&
			page_array[i] == page_array[i-1] + 1) {
			count++;
			continue;
		}

		/* page i starts a new run; validate the run that just ended */
		if (nr_folios == 1) {
			/* the first folio must be used right up to its last page */
			if (folio_page_idx(folio, page_array[i-1]) !=
				data->nr_pages_mid - 1)
				return false;

			data->nr_pages_head = count;
		} else if (count != data->nr_pages_mid) {
			return false;
		}

		/* the next folio must have the same size and start at page 0 */
		folio = page_folio(page_array[i]);
		if (folio_size(folio) != (1UL << data->folio_shift) ||
			folio_page_idx(folio, page_array[i]) != 0)
			return false;

		count = 1;
		nr_folios++;
	}
	/* everything lives in a single folio */
	if (nr_folios == 1)
		data->nr_pages_head = count;

	return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios);
}
952 
953 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
954 				  struct io_mapped_ubuf **pimu,
955 				  struct page **last_hpage)
956 {
957 	struct io_mapped_ubuf *imu = NULL;
958 	struct page **pages = NULL;
959 	unsigned long off;
960 	size_t size;
961 	int ret, nr_pages, i;
962 	struct io_imu_folio_data data;
963 	bool coalesced;
964 
965 	*pimu = (struct io_mapped_ubuf *)&dummy_ubuf;
966 	if (!iov->iov_base)
967 		return 0;
968 
969 	ret = -ENOMEM;
970 	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
971 				&nr_pages);
972 	if (IS_ERR(pages)) {
973 		ret = PTR_ERR(pages);
974 		pages = NULL;
975 		goto done;
976 	}
977 
978 	/* If it's huge page(s), try to coalesce them into fewer bvec entries */
979 	coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data);
980 
981 	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
982 	if (!imu)
983 		goto done;
984 
985 	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
986 	if (ret) {
987 		unpin_user_pages(pages, nr_pages);
988 		goto done;
989 	}
990 
991 	size = iov->iov_len;
992 	/* store original address for later verification */
993 	imu->ubuf = (unsigned long) iov->iov_base;
994 	imu->len = iov->iov_len;
995 	imu->nr_bvecs = nr_pages;
996 	imu->folio_shift = PAGE_SHIFT;
997 	if (coalesced)
998 		imu->folio_shift = data.folio_shift;
999 	refcount_set(&imu->refs, 1);
1000 	off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1);
1001 	*pimu = imu;
1002 	ret = 0;
1003 
1004 	for (i = 0; i < nr_pages; i++) {
1005 		size_t vec_len;
1006 
1007 		vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
1008 		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
1009 		off = 0;
1010 		size -= vec_len;
1011 	}
1012 done:
1013 	if (ret)
1014 		kvfree(imu);
1015 	kvfree(pages);
1016 	return ret;
1017 }
1018 
1019 static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
1020 {
1021 	ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
1022 	return ctx->user_bufs ? 0 : -ENOMEM;
1023 }
1024 
1025 int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
1026 			    unsigned int nr_args, u64 __user *tags)
1027 {
1028 	struct page *last_hpage = NULL;
1029 	struct io_rsrc_data *data;
1030 	struct iovec fast_iov, *iov = &fast_iov;
1031 	const struct iovec __user *uvec;
1032 	int i, ret;
1033 
1034 	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
1035 
1036 	if (ctx->user_bufs)
1037 		return -EBUSY;
1038 	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
1039 		return -EINVAL;
1040 	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, tags, nr_args, &data);
1041 	if (ret)
1042 		return ret;
1043 	ret = io_buffers_map_alloc(ctx, nr_args);
1044 	if (ret) {
1045 		io_rsrc_data_free(data);
1046 		return ret;
1047 	}
1048 
1049 	if (!arg)
1050 		memset(iov, 0, sizeof(*iov));
1051 
1052 	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
1053 		if (arg) {
1054 			uvec = (struct iovec __user *) arg;
1055 			iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
1056 			if (IS_ERR(iov)) {
1057 				ret = PTR_ERR(iov);
1058 				break;
1059 			}
1060 			ret = io_buffer_validate(iov);
1061 			if (ret)
1062 				break;
1063 			if (ctx->compat)
1064 				arg += sizeof(struct compat_iovec);
1065 			else
1066 				arg += sizeof(struct iovec);
1067 		}
1068 
1069 		if (!iov->iov_base && *io_get_tag_slot(data, i)) {
1070 			ret = -EINVAL;
1071 			break;
1072 		}
1073 
1074 		ret = io_sqe_buffer_register(ctx, iov, &ctx->user_bufs[i],
1075 					     &last_hpage);
1076 		if (ret)
1077 			break;
1078 	}
1079 
1080 	WARN_ON_ONCE(ctx->buf_data);
1081 
1082 	ctx->buf_data = data;
1083 	if (ret)
1084 		__io_sqe_buffers_unregister(ctx);
1085 	return ret;
1086 }
1087 
/*
 * Set up @iter as a bvec iterator over the part of the registered buffer
 * @imu that covers [@buf_addr, @buf_addr + @len).
 *
 * Returns 0 on success, or -EFAULT if @imu is NULL, the range overflows,
 * or the range is not fully inside the registered buffer.
 */
int io_import_fixed(int ddir, struct iov_iter *iter,
			   struct io_mapped_ubuf *imu,
			   u64 buf_addr, size_t len)
{
	u64 buf_end;
	size_t offset;

	if (WARN_ON_ONCE(!imu))
		return -EFAULT;
	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
		return -EFAULT;

	/*
	 * Might not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);

	if (offset) {
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here, because
		 * we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are the same in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset lies within the first bvec, only the count
		 * and the offset need adjusting. Otherwise skip the first
		 * segment separately, since it may not be folio_size aligned.
		 */
		const struct bio_vec *bvec = imu->bvec;

		if (offset < bvec->bv_len) {
			iter->bvec = bvec;
			iter->count -= offset;
			iter->iov_offset = offset;
		} else {
			unsigned long seg_skip;

			/* skip first vec */
			offset -= bvec->bv_len;
			/* whole folio sized segments to skip, plus the first vec */
			seg_skip = 1 + (offset >> imu->folio_shift);

			iter->bvec = bvec + seg_skip;
			iter->nr_segs -= seg_skip;
			iter->count -= bvec->bv_len + offset;
			iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1);
		}
	}

	return 0;
}
1149 
1150 static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx)
1151 {
1152 	struct io_mapped_ubuf **user_bufs;
1153 	struct io_rsrc_data *data;
1154 	int i, ret, nbufs;
1155 
1156 	/*
1157 	 * Drop our own lock here. We'll setup the data we need and reference
1158 	 * the source buffers, then re-grab, check, and assign at the end.
1159 	 */
1160 	mutex_unlock(&ctx->uring_lock);
1161 
1162 	mutex_lock(&src_ctx->uring_lock);
1163 	ret = -ENXIO;
1164 	nbufs = src_ctx->nr_user_bufs;
1165 	if (!nbufs)
1166 		goto out_unlock;
1167 	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, NULL, nbufs, &data);
1168 	if (ret)
1169 		goto out_unlock;
1170 
1171 	ret = -ENOMEM;
1172 	user_bufs = kcalloc(nbufs, sizeof(*ctx->user_bufs), GFP_KERNEL);
1173 	if (!user_bufs)
1174 		goto out_free_data;
1175 
1176 	for (i = 0; i < nbufs; i++) {
1177 		struct io_mapped_ubuf *src = src_ctx->user_bufs[i];
1178 
1179 		refcount_inc(&src->refs);
1180 		user_bufs[i] = src;
1181 	}
1182 
1183 	/* Have a ref on the bufs now, drop src lock and re-grab our own lock */
1184 	mutex_unlock(&src_ctx->uring_lock);
1185 	mutex_lock(&ctx->uring_lock);
1186 	if (!ctx->user_bufs) {
1187 		ctx->user_bufs = user_bufs;
1188 		ctx->buf_data = data;
1189 		ctx->nr_user_bufs = nbufs;
1190 		return 0;
1191 	}
1192 
1193 	/* someone raced setting up buffers, dump ours */
1194 	for (i = 0; i < nbufs; i++)
1195 		io_buffer_unmap(ctx, &user_bufs[i]);
1196 	io_rsrc_data_free(data);
1197 	kfree(user_bufs);
1198 	return -EBUSY;
1199 out_free_data:
1200 	io_rsrc_data_free(data);
1201 out_unlock:
1202 	mutex_unlock(&src_ctx->uring_lock);
1203 	mutex_lock(&ctx->uring_lock);
1204 	return ret;
1205 }
1206 
1207 /*
1208  * Copy the registered buffers from the source ring whose file descriptor
1209  * is given in the src_fd to the current ring. This is identical to registering
1210  * the buffers with ctx, except faster as mappings already exist.
1211  *
1212  * Since the memory is already accounted once, don't account it again.
1213  */
1214 int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
1215 {
1216 	struct io_uring_clone_buffers buf;
1217 	bool registered_src;
1218 	struct file *file;
1219 	int ret;
1220 
1221 	if (ctx->user_bufs || ctx->nr_user_bufs)
1222 		return -EBUSY;
1223 	if (copy_from_user(&buf, arg, sizeof(buf)))
1224 		return -EFAULT;
1225 	if (buf.flags & ~IORING_REGISTER_SRC_REGISTERED)
1226 		return -EINVAL;
1227 	if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
1228 		return -EINVAL;
1229 
1230 	registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
1231 	file = io_uring_register_get_file(buf.src_fd, registered_src);
1232 	if (IS_ERR(file))
1233 		return PTR_ERR(file);
1234 	ret = io_clone_buffers(ctx, file->private_data);
1235 	if (!registered_src)
1236 		fput(file);
1237 	return ret;
1238 }
1239