xref: /linux/io_uring/register.c (revision cdd30ebb1b9f36159d66f088b61aee264e649d7a)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Code related to the io_uring_register() syscall
4  *
5  * Copyright (C) 2023 Jens Axboe
6  */
7 #include <linux/kernel.h>
8 #include <linux/errno.h>
9 #include <linux/syscalls.h>
10 #include <linux/refcount.h>
11 #include <linux/bits.h>
12 #include <linux/fs.h>
13 #include <linux/file.h>
14 #include <linux/slab.h>
15 #include <linux/uaccess.h>
16 #include <linux/nospec.h>
17 #include <linux/compat.h>
18 #include <linux/io_uring.h>
19 #include <linux/io_uring_types.h>
20 
21 #include "io_uring.h"
22 #include "opdef.h"
23 #include "tctx.h"
24 #include "rsrc.h"
25 #include "sqpoll.h"
26 #include "register.h"
27 #include "cancel.h"
28 #include "kbuf.h"
29 #include "napi.h"
30 #include "eventfd.h"
31 #include "msg_ring.h"
32 #include "memmap.h"
33 
34 #define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
35 				 IORING_REGISTER_LAST + IORING_OP_LAST)
36 
37 static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
38 			   unsigned nr_args)
39 {
40 	struct io_uring_probe *p;
41 	size_t size;
42 	int i, ret;
43 
44 	if (nr_args > IORING_OP_LAST)
45 		nr_args = IORING_OP_LAST;
46 
47 	size = struct_size(p, ops, nr_args);
48 	p = kzalloc(size, GFP_KERNEL);
49 	if (!p)
50 		return -ENOMEM;
51 
52 	ret = -EFAULT;
53 	if (copy_from_user(p, arg, size))
54 		goto out;
55 	ret = -EINVAL;
56 	if (memchr_inv(p, 0, size))
57 		goto out;
58 
59 	p->last_op = IORING_OP_LAST - 1;
60 
61 	for (i = 0; i < nr_args; i++) {
62 		p->ops[i].op = i;
63 		if (io_uring_op_supported(i))
64 			p->ops[i].flags = IO_URING_OP_SUPPORTED;
65 	}
66 	p->ops_len = i;
67 
68 	ret = 0;
69 	if (copy_to_user(arg, p, size))
70 		ret = -EFAULT;
71 out:
72 	kfree(p);
73 	return ret;
74 }
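
/*
 * A minimal userspace sketch of the probe interface implemented above: pass a
 * zeroed struct io_uring_probe with room for up to 256 ops and test
 * IO_URING_OP_SUPPORTED on the opcode of interest. Assumes <linux/io_uring.h>,
 * <sys/syscall.h>, <unistd.h> and <stdlib.h>; 'ring_fd' is a hypothetical fd
 * returned by io_uring_setup().
 *
 *	struct io_uring_probe *p;
 *	int have_readv = 0;
 *
 *	p = calloc(1, sizeof(*p) + 256 * sizeof(p->ops[0]));
 *	if (p && !syscall(__NR_io_uring_register, ring_fd,
 *			  IORING_REGISTER_PROBE, p, 256))
 *		have_readv = p->ops[IORING_OP_READV].flags & IO_URING_OP_SUPPORTED;
 *	free(p);
 */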
75 
76 int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
77 {
78 	const struct cred *creds;
79 
80 	creds = xa_erase(&ctx->personalities, id);
81 	if (creds) {
82 		put_cred(creds);
83 		return 0;
84 	}
85 
86 	return -EINVAL;
87 }
88 
89 
90 static int io_register_personality(struct io_ring_ctx *ctx)
91 {
92 	const struct cred *creds;
93 	u32 id;
94 	int ret;
95 
96 	creds = get_current_cred();
97 
98 	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
99 			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
100 	if (ret < 0) {
101 		put_cred(creds);
102 		return ret;
103 	}
104 	return id;
105 }
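
/*
 * A rough usage sketch for the personality registration above, assuming the
 * uapi header plus syscall(2); 'ring_fd' is a hypothetical ring fd. The id
 * returned here is what a later submission places in sqe->personality so the
 * request runs with the credentials captured at registration time.
 *
 *	int id = syscall(__NR_io_uring_register, ring_fd,
 *			 IORING_REGISTER_PERSONALITY, NULL, 0);
 *
 *	if (id >= 0) {
 *		// ... issue requests with sqe->personality = id ...
 *		syscall(__NR_io_uring_register, ring_fd,
 *			IORING_UNREGISTER_PERSONALITY, NULL, id);
 *	}
 */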
106 
107 static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
108 					   void __user *arg, unsigned int nr_args)
109 {
110 	struct io_uring_restriction *res;
111 	size_t size;
112 	int i, ret;
113 
114 	/* Restrictions allowed only if rings started disabled */
115 	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
116 		return -EBADFD;
117 
118 	/* We allow only a single restrictions registration */
119 	if (ctx->restrictions.registered)
120 		return -EBUSY;
121 
122 	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
123 		return -EINVAL;
124 
125 	size = array_size(nr_args, sizeof(*res));
126 	if (size == SIZE_MAX)
127 		return -EOVERFLOW;
128 
129 	res = memdup_user(arg, size);
130 	if (IS_ERR(res))
131 		return PTR_ERR(res);
132 
133 	ret = 0;
134 
135 	for (i = 0; i < nr_args; i++) {
136 		switch (res[i].opcode) {
137 		case IORING_RESTRICTION_REGISTER_OP:
138 			if (res[i].register_op >= IORING_REGISTER_LAST) {
139 				ret = -EINVAL;
140 				goto out;
141 			}
142 
143 			__set_bit(res[i].register_op,
144 				  ctx->restrictions.register_op);
145 			break;
146 		case IORING_RESTRICTION_SQE_OP:
147 			if (res[i].sqe_op >= IORING_OP_LAST) {
148 				ret = -EINVAL;
149 				goto out;
150 			}
151 
152 			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
153 			break;
154 		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
155 			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
156 			break;
157 		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
158 			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
159 			break;
160 		default:
161 			ret = -EINVAL;
162 			goto out;
163 		}
164 	}
165 
166 out:
167 	/* Reset all restrictions if an error happened */
168 	if (ret != 0)
169 		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
170 	else
171 		ctx->restrictions.registered = true;
172 
173 	kfree(res);
174 	return ret;
175 }
176 
177 static int io_register_enable_rings(struct io_ring_ctx *ctx)
178 {
179 	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
180 		return -EBADFD;
181 
182 	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
183 		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
184 		/*
185 		 * Lazy activation attempts would fail if the ring was polled
186 		 * before submitter_task is set.
187 		 */
188 		if (wq_has_sleeper(&ctx->poll_wq))
189 			io_activate_pollwq(ctx);
190 	}
191 
192 	if (ctx->restrictions.registered)
193 		ctx->restricted = 1;
194 
195 	ctx->flags &= ~IORING_SETUP_R_DISABLED;
196 	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
197 		wake_up(&ctx->sq_data->wait);
198 	return 0;
199 }
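
/*
 * A sketch of the restriction/enable flow handled by the two functions above,
 * assuming the uapi header and syscall(2); 'ring_fd' is a hypothetical ring
 * created with IORING_SETUP_R_DISABLED. Restrictions may be registered only
 * once and only while the ring is still disabled; enabling the rings then
 * makes them take effect.
 *
 *	struct io_uring_restriction res[] = {
 *		{ .opcode = IORING_RESTRICTION_SQE_OP,
 *		  .sqe_op = IORING_OP_READV },
 *		{ .opcode = IORING_RESTRICTION_REGISTER_OP,
 *		  .register_op = IORING_REGISTER_BUFFERS },
 *	};
 *
 *	if (!syscall(__NR_io_uring_register, ring_fd,
 *		     IORING_REGISTER_RESTRICTIONS, res, 2))
 *		syscall(__NR_io_uring_register, ring_fd,
 *			IORING_REGISTER_ENABLE_RINGS, NULL, 0);
 */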
200 
201 static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
202 					 cpumask_var_t new_mask)
203 {
204 	int ret;
205 
206 	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
207 		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
208 	} else {
209 		mutex_unlock(&ctx->uring_lock);
210 		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
211 		mutex_lock(&ctx->uring_lock);
212 	}
213 
214 	return ret;
215 }
216 
217 static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
218 				       void __user *arg, unsigned len)
219 {
220 	cpumask_var_t new_mask;
221 	int ret;
222 
223 	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
224 		return -ENOMEM;
225 
226 	cpumask_clear(new_mask);
227 	if (len > cpumask_size())
228 		len = cpumask_size();
229 
230 #ifdef CONFIG_COMPAT
231 	if (in_compat_syscall())
232 		ret = compat_get_bitmap(cpumask_bits(new_mask),
233 					(const compat_ulong_t __user *)arg,
234 					len * 8 /* CHAR_BIT */);
235 	else
236 #endif
237 		ret = copy_from_user(new_mask, arg, len);
238 
239 	if (ret) {
240 		free_cpumask_var(new_mask);
241 		return -EFAULT;
242 	}
243 
244 	ret = __io_register_iowq_aff(ctx, new_mask);
245 	free_cpumask_var(new_mask);
246 	return ret;
247 }
248 
249 static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
250 {
251 	return __io_register_iowq_aff(ctx, NULL);
252 }
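
/*
 * A sketch of the io-wq affinity interface above: the argument is a raw CPU
 * bitmap and nr_args is its length in bytes (compat callers go through
 * compat_get_bitmap()). Clearing the affinity again uses the unregister
 * opcode with no argument. 'ring_fd' is hypothetical.
 *
 *	unsigned long mask = 1UL << 2;	// allow io-wq workers on CPU 2 only
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_IOWQ_AFF,
 *		&mask, sizeof(mask));
 *	// ... later, drop the restriction:
 *	syscall(__NR_io_uring_register, ring_fd, IORING_UNREGISTER_IOWQ_AFF,
 *		NULL, 0);
 */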
253 
254 static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
255 					       void __user *arg)
256 	__must_hold(&ctx->uring_lock)
257 {
258 	struct io_tctx_node *node;
259 	struct io_uring_task *tctx = NULL;
260 	struct io_sq_data *sqd = NULL;
261 	__u32 new_count[2];
262 	int i, ret;
263 
264 	if (copy_from_user(new_count, arg, sizeof(new_count)))
265 		return -EFAULT;
266 	for (i = 0; i < ARRAY_SIZE(new_count); i++)
267 		if (new_count[i] > INT_MAX)
268 			return -EINVAL;
269 
270 	if (ctx->flags & IORING_SETUP_SQPOLL) {
271 		sqd = ctx->sq_data;
272 		if (sqd) {
273 			/*
274 			 * Observe the correct sqd->lock -> ctx->uring_lock
275 			 * ordering. Fine to drop uring_lock here, we hold
276 			 * a ref to the ctx.
277 			 */
278 			refcount_inc(&sqd->refs);
279 			mutex_unlock(&ctx->uring_lock);
280 			mutex_lock(&sqd->lock);
281 			mutex_lock(&ctx->uring_lock);
282 			if (sqd->thread)
283 				tctx = sqd->thread->io_uring;
284 		}
285 	} else {
286 		tctx = current->io_uring;
287 	}
288 
289 	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
290 
291 	for (i = 0; i < ARRAY_SIZE(new_count); i++)
292 		if (new_count[i])
293 			ctx->iowq_limits[i] = new_count[i];
294 	ctx->iowq_limits_set = true;
295 
296 	if (tctx && tctx->io_wq) {
297 		ret = io_wq_max_workers(tctx->io_wq, new_count);
298 		if (ret)
299 			goto err;
300 	} else {
301 		memset(new_count, 0, sizeof(new_count));
302 	}
303 
304 	if (sqd) {
305 		mutex_unlock(&ctx->uring_lock);
306 		mutex_unlock(&sqd->lock);
307 		io_put_sq_data(sqd);
308 		mutex_lock(&ctx->uring_lock);
309 	}
310 
311 	if (copy_to_user(arg, new_count, sizeof(new_count)))
312 		return -EFAULT;
313 
314 	/* that's it for SQPOLL; only the SQPOLL task creates requests */
315 	if (sqd)
316 		return 0;
317 
318 	/* now propagate the restriction to all registered users */
319 	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
320 		tctx = node->task->io_uring;
321 		if (WARN_ON_ONCE(!tctx->io_wq))
322 			continue;
323 
324 		for (i = 0; i < ARRAY_SIZE(new_count); i++)
325 			new_count[i] = ctx->iowq_limits[i];
326 		/* ignore errors, it always returns zero anyway */
327 		(void)io_wq_max_workers(tctx->io_wq, new_count);
328 	}
329 	return 0;
330 err:
331 	if (sqd) {
332 		mutex_unlock(&ctx->uring_lock);
333 		mutex_unlock(&sqd->lock);
334 		io_put_sq_data(sqd);
335 		mutex_lock(&ctx->uring_lock);
336 	}
337 	return ret;
338 }
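
/*
 * A sketch of the max-workers interface above: new_count[0] caps bounded
 * io-wq workers and new_count[1] unbounded ones (a 0 leaves that limit
 * alone), and the previous limits are copied back to userspace on success.
 * 'ring_fd' is hypothetical.
 *
 *	__u32 counts[2] = { 4, 8 };	// bounded, unbounded
 *
 *	if (!syscall(__NR_io_uring_register, ring_fd,
 *		     IORING_REGISTER_IOWQ_MAX_WORKERS, counts, 2)) {
 *		// counts[] now holds the limits that were in effect before
 *	}
 */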
339 
340 static int io_register_clock(struct io_ring_ctx *ctx,
341 			     struct io_uring_clock_register __user *arg)
342 {
343 	struct io_uring_clock_register reg;
344 
345 	if (copy_from_user(&reg, arg, sizeof(reg)))
346 		return -EFAULT;
347 	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
348 		return -EINVAL;
349 
350 	switch (reg.clockid) {
351 	case CLOCK_MONOTONIC:
352 		ctx->clock_offset = 0;
353 		break;
354 	case CLOCK_BOOTTIME:
355 		ctx->clock_offset = TK_OFFS_BOOT;
356 		break;
357 	default:
358 		return -EINVAL;
359 	}
360 
361 	ctx->clockid = reg.clockid;
362 	return 0;
363 }
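
/*
 * A sketch of the clock registration above: only CLOCK_MONOTONIC and
 * CLOCK_BOOTTIME are accepted, the reserved fields must be zero, and nr_args
 * must be 0. Assumes <time.h> for the clock ids; 'ring_fd' is hypothetical.
 *
 *	struct io_uring_clock_register reg = {
 *		.clockid = CLOCK_BOOTTIME,
 *	};
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_CLOCK, &reg, 0);
 */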
364 
365 /*
366  * State to maintain until we can swap. Holds both the new and the old
367  * state, used for either mapping or freeing.
368  */
369 struct io_ring_ctx_rings {
370 	unsigned short n_ring_pages;
371 	unsigned short n_sqe_pages;
372 	struct page **ring_pages;
373 	struct page **sqe_pages;
374 	struct io_uring_sqe *sq_sqes;
375 	struct io_rings *rings;
376 };
377 
378 static void io_register_free_rings(struct io_uring_params *p,
379 				   struct io_ring_ctx_rings *r)
380 {
381 	if (!(p->flags & IORING_SETUP_NO_MMAP)) {
382 		io_pages_unmap(r->rings, &r->ring_pages, &r->n_ring_pages,
383 				true);
384 		io_pages_unmap(r->sq_sqes, &r->sqe_pages, &r->n_sqe_pages,
385 				true);
386 	} else {
387 		io_pages_free(&r->ring_pages, r->n_ring_pages);
388 		io_pages_free(&r->sqe_pages, r->n_sqe_pages);
389 		vunmap(r->rings);
390 		vunmap(r->sq_sqes);
391 	}
392 }
393 
394 #define swap_old(ctx, o, n, field)		\
395 	do {					\
396 		(o).field = (ctx)->field;	\
397 		(ctx)->field = (n).field;	\
398 	} while (0)
399 
400 #define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
401 #define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
402 			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP)
403 
404 static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
405 {
406 	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
407 	size_t size, sq_array_offset;
408 	struct io_uring_params p;
409 	unsigned i, tail;
410 	void *ptr;
411 	int ret;
412 
413 	/* for single issuer, must be owner resizing */
414 	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER &&
415 	    current != ctx->submitter_task)
416 		return -EEXIST;
417 	if (copy_from_user(&p, arg, sizeof(p)))
418 		return -EFAULT;
419 	if (p.flags & ~RESIZE_FLAGS)
420 		return -EINVAL;
421 
422 	/* properties that are always inherited */
423 	p.flags |= (ctx->flags & COPY_FLAGS);
424 
425 	ret = io_uring_fill_params(p.sq_entries, &p);
426 	if (unlikely(ret))
427 		return ret;
428 
429 	/* nothing to do, but copy params back */
430 	if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) {
431 		if (copy_to_user(arg, &p, sizeof(p)))
432 			return -EFAULT;
433 		return 0;
434 	}
435 
436 	size = rings_size(p.flags, p.sq_entries, p.cq_entries,
437 				&sq_array_offset);
438 	if (size == SIZE_MAX)
439 		return -EOVERFLOW;
440 
441 	if (!(p.flags & IORING_SETUP_NO_MMAP))
442 		n.rings = io_pages_map(&n.ring_pages, &n.n_ring_pages, size);
443 	else
444 		n.rings = __io_uaddr_map(&n.ring_pages, &n.n_ring_pages,
445 						p.cq_off.user_addr, size);
446 	if (IS_ERR(n.rings))
447 		return PTR_ERR(n.rings);
448 
449 	n.rings->sq_ring_mask = p.sq_entries - 1;
450 	n.rings->cq_ring_mask = p.cq_entries - 1;
451 	n.rings->sq_ring_entries = p.sq_entries;
452 	n.rings->cq_ring_entries = p.cq_entries;
453 
454 	if (copy_to_user(arg, &p, sizeof(p))) {
455 		io_register_free_rings(&p, &n);
456 		return -EFAULT;
457 	}
458 
459 	if (p.flags & IORING_SETUP_SQE128)
460 		size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries);
461 	else
462 		size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
463 	if (size == SIZE_MAX) {
464 		io_register_free_rings(&p, &n);
465 		return -EOVERFLOW;
466 	}
467 
468 	if (!(p.flags & IORING_SETUP_NO_MMAP))
469 		ptr = io_pages_map(&n.sqe_pages, &n.n_sqe_pages, size);
470 	else
471 		ptr = __io_uaddr_map(&n.sqe_pages, &n.n_sqe_pages,
472 					p.sq_off.user_addr,
473 					size);
474 	if (IS_ERR(ptr)) {
475 		io_register_free_rings(&p, &n);
476 		return PTR_ERR(ptr);
477 	}
478 
479 	/*
480 	 * If using SQPOLL, park the thread
481 	 */
482 	if (ctx->sq_data) {
483 		mutex_unlock(&ctx->uring_lock);
484 		io_sq_thread_park(ctx->sq_data);
485 		mutex_lock(&ctx->uring_lock);
486 	}
487 
488 	/*
489 	 * We'll do the swap. Grab the ctx->resize_lock, which will exclude
490 	 * any new mmap's on the ring fd. Clear out existing mappings to prevent
491 	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
492 	 * existing rings beyond this point will fail. Not that it could proceed
493 	 * at this point anyway, as the io_uring mmap side needs to grab the
494 	 * ctx->resize_lock as well. Likewise, hold the completion lock over the
495 	 * duration of the actual swap.
496 	 */
497 	mutex_lock(&ctx->resize_lock);
498 	spin_lock(&ctx->completion_lock);
499 	o.rings = ctx->rings;
500 	ctx->rings = NULL;
501 	o.sq_sqes = ctx->sq_sqes;
502 	ctx->sq_sqes = NULL;
503 
504 	/*
505 	 * Now copy SQ and CQ entries, if any. If either of the destination
506 	 * rings can't hold what is already there, then fail the operation.
507 	 */
508 	n.sq_sqes = ptr;
509 	tail = o.rings->sq.tail;
510 	if (tail - o.rings->sq.head > p.sq_entries)
511 		goto overflow;
512 	for (i = o.rings->sq.head; i < tail; i++) {
513 		unsigned src_head = i & (ctx->sq_entries - 1);
514 		unsigned dst_head = i & n.rings->sq_ring_mask;
515 
516 		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
517 	}
518 	n.rings->sq.head = o.rings->sq.head;
519 	n.rings->sq.tail = o.rings->sq.tail;
520 
521 	tail = o.rings->cq.tail;
522 	if (tail - o.rings->cq.head > p.cq_entries) {
523 overflow:
524 		/* restore old rings, and return -EOVERFLOW via cleanup path */
525 		ctx->rings = o.rings;
526 		ctx->sq_sqes = o.sq_sqes;
527 		to_free = &n;
528 		ret = -EOVERFLOW;
529 		goto out;
530 	}
531 	for (i = o.rings->cq.head; i < tail; i++) {
532 		unsigned src_head = i & (ctx->cq_entries - 1);
533 		unsigned dst_head = i & n.rings->cq_ring_mask;
534 
535 		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
536 	}
537 	n.rings->cq.head = o.rings->cq.head;
538 	n.rings->cq.tail = o.rings->cq.tail;
539 	/* invalidate cached cqe refill */
540 	ctx->cqe_cached = ctx->cqe_sentinel = NULL;
541 
542 	n.rings->sq_dropped = o.rings->sq_dropped;
543 	n.rings->sq_flags = o.rings->sq_flags;
544 	n.rings->cq_flags = o.rings->cq_flags;
545 	n.rings->cq_overflow = o.rings->cq_overflow;
546 
547 	/* all done, store old pointers and assign new ones */
548 	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
549 		ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset);
550 
551 	ctx->sq_entries = p.sq_entries;
552 	ctx->cq_entries = p.cq_entries;
553 
554 	ctx->rings = n.rings;
555 	ctx->sq_sqes = n.sq_sqes;
556 	swap_old(ctx, o, n, n_ring_pages);
557 	swap_old(ctx, o, n, n_sqe_pages);
558 	swap_old(ctx, o, n, ring_pages);
559 	swap_old(ctx, o, n, sqe_pages);
560 	to_free = &o;
561 	ret = 0;
562 out:
563 	spin_unlock(&ctx->completion_lock);
564 	mutex_unlock(&ctx->resize_lock);
565 	io_register_free_rings(&p, to_free);
566 
567 	if (ctx->sq_data)
568 		io_sq_thread_unpark(ctx->sq_data);
569 
570 	return ret;
571 }
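
/*
 * A sketch of the resize interface above: only the RESIZE_FLAGS sizing flags
 * may be passed in, everything in COPY_FLAGS is inherited from the existing
 * ring, and the resulting geometry/offsets are written back into the params
 * struct. After a successful resize the ring memory has been replaced, so a
 * userspace consumer would re-map it based on the returned offsets (unless it
 * supplies the memory itself via IORING_SETUP_NO_MMAP). 'ring_fd' is
 * hypothetical.
 *
 *	struct io_uring_params p = {
 *		.sq_entries = 256,
 *		.cq_entries = 512,
 *		.flags = IORING_SETUP_CQSIZE,
 *	};
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_RESIZE_RINGS,
 *		&p, 1);
 */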
572 
573 static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
574 {
575 	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
576 	struct io_uring_mem_region_reg reg;
577 	struct io_uring_region_desc __user *rd_uptr;
578 	struct io_uring_region_desc rd;
579 	int ret;
580 
581 	if (io_region_is_set(&ctx->param_region))
582 		return -EBUSY;
583 	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
584 		return -EFAULT;
585 	rd_uptr = u64_to_user_ptr(reg.region_uptr);
586 	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
587 		return -EFAULT;
588 
589 	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
590 		return -EINVAL;
591 	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
592 		return -EINVAL;
593 
594 	/*
595 	 * This ensures there are no waiters. Waiters run without holding the
596 	 * lock and are hard to synchronise with, especially if we need to
597 	 * initialise the region.
598 	 */
599 	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
600 	    !(ctx->flags & IORING_SETUP_R_DISABLED))
601 		return -EINVAL;
602 
603 	ret = io_create_region(ctx, &ctx->param_region, &rd);
604 	if (ret)
605 		return ret;
606 	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
607 		io_free_region(ctx, &ctx->param_region);
608 		return -EFAULT;
609 	}
610 
611 	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
612 		ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region);
613 		ctx->cq_wait_size = rd.size;
614 	}
615 	return 0;
616 }
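
/*
 * A heavily hedged sketch of the memory-region registration above: only the
 * fields this function itself touches are shown, and the rest of struct
 * io_uring_region_desc (how the backing memory is described) is left to the
 * uapi header. Note that IORING_MEM_REGION_REG_WAIT_ARG is only accepted
 * while the ring is still IORING_SETUP_R_DISABLED. 'ring_fd' and the
 * descriptor setup are hypothetical.
 *
 *	struct io_uring_region_desc rd = {
 *		.size = 4096,
 *		// remaining fields per the uapi definition of the region
 *	};
 *	struct io_uring_mem_region_reg reg = {
 *		.region_uptr = (__u64)(unsigned long)&rd,
 *		.flags = IORING_MEM_REGION_REG_WAIT_ARG,
 *	};
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_MEM_REGION,
 *		&reg, 1);
 */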
617 
618 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
619 			       void __user *arg, unsigned nr_args)
620 	__releases(ctx->uring_lock)
621 	__acquires(ctx->uring_lock)
622 {
623 	int ret;
624 
625 	/*
626 	 * We don't quiesce the refs for register anymore and so it can't be
627 	 * dying as we're holding a file ref here.
628 	 */
629 	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
630 		return -ENXIO;
631 
632 	if (ctx->submitter_task && ctx->submitter_task != current)
633 		return -EEXIST;
634 
635 	if (ctx->restricted) {
636 		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
637 		if (!test_bit(opcode, ctx->restrictions.register_op))
638 			return -EACCES;
639 	}
640 
641 	switch (opcode) {
642 	case IORING_REGISTER_BUFFERS:
643 		ret = -EFAULT;
644 		if (!arg)
645 			break;
646 		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
647 		break;
648 	case IORING_UNREGISTER_BUFFERS:
649 		ret = -EINVAL;
650 		if (arg || nr_args)
651 			break;
652 		ret = io_sqe_buffers_unregister(ctx);
653 		break;
654 	case IORING_REGISTER_FILES:
655 		ret = -EFAULT;
656 		if (!arg)
657 			break;
658 		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
659 		break;
660 	case IORING_UNREGISTER_FILES:
661 		ret = -EINVAL;
662 		if (arg || nr_args)
663 			break;
664 		ret = io_sqe_files_unregister(ctx);
665 		break;
666 	case IORING_REGISTER_FILES_UPDATE:
667 		ret = io_register_files_update(ctx, arg, nr_args);
668 		break;
669 	case IORING_REGISTER_EVENTFD:
670 		ret = -EINVAL;
671 		if (nr_args != 1)
672 			break;
673 		ret = io_eventfd_register(ctx, arg, 0);
674 		break;
675 	case IORING_REGISTER_EVENTFD_ASYNC:
676 		ret = -EINVAL;
677 		if (nr_args != 1)
678 			break;
679 		ret = io_eventfd_register(ctx, arg, 1);
680 		break;
681 	case IORING_UNREGISTER_EVENTFD:
682 		ret = -EINVAL;
683 		if (arg || nr_args)
684 			break;
685 		ret = io_eventfd_unregister(ctx);
686 		break;
687 	case IORING_REGISTER_PROBE:
688 		ret = -EINVAL;
689 		if (!arg || nr_args > 256)
690 			break;
691 		ret = io_probe(ctx, arg, nr_args);
692 		break;
693 	case IORING_REGISTER_PERSONALITY:
694 		ret = -EINVAL;
695 		if (arg || nr_args)
696 			break;
697 		ret = io_register_personality(ctx);
698 		break;
699 	case IORING_UNREGISTER_PERSONALITY:
700 		ret = -EINVAL;
701 		if (arg)
702 			break;
703 		ret = io_unregister_personality(ctx, nr_args);
704 		break;
705 	case IORING_REGISTER_ENABLE_RINGS:
706 		ret = -EINVAL;
707 		if (arg || nr_args)
708 			break;
709 		ret = io_register_enable_rings(ctx);
710 		break;
711 	case IORING_REGISTER_RESTRICTIONS:
712 		ret = io_register_restrictions(ctx, arg, nr_args);
713 		break;
714 	case IORING_REGISTER_FILES2:
715 		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
716 		break;
717 	case IORING_REGISTER_FILES_UPDATE2:
718 		ret = io_register_rsrc_update(ctx, arg, nr_args,
719 					      IORING_RSRC_FILE);
720 		break;
721 	case IORING_REGISTER_BUFFERS2:
722 		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
723 		break;
724 	case IORING_REGISTER_BUFFERS_UPDATE:
725 		ret = io_register_rsrc_update(ctx, arg, nr_args,
726 					      IORING_RSRC_BUFFER);
727 		break;
728 	case IORING_REGISTER_IOWQ_AFF:
729 		ret = -EINVAL;
730 		if (!arg || !nr_args)
731 			break;
732 		ret = io_register_iowq_aff(ctx, arg, nr_args);
733 		break;
734 	case IORING_UNREGISTER_IOWQ_AFF:
735 		ret = -EINVAL;
736 		if (arg || nr_args)
737 			break;
738 		ret = io_unregister_iowq_aff(ctx);
739 		break;
740 	case IORING_REGISTER_IOWQ_MAX_WORKERS:
741 		ret = -EINVAL;
742 		if (!arg || nr_args != 2)
743 			break;
744 		ret = io_register_iowq_max_workers(ctx, arg);
745 		break;
746 	case IORING_REGISTER_RING_FDS:
747 		ret = io_ringfd_register(ctx, arg, nr_args);
748 		break;
749 	case IORING_UNREGISTER_RING_FDS:
750 		ret = io_ringfd_unregister(ctx, arg, nr_args);
751 		break;
752 	case IORING_REGISTER_PBUF_RING:
753 		ret = -EINVAL;
754 		if (!arg || nr_args != 1)
755 			break;
756 		ret = io_register_pbuf_ring(ctx, arg);
757 		break;
758 	case IORING_UNREGISTER_PBUF_RING:
759 		ret = -EINVAL;
760 		if (!arg || nr_args != 1)
761 			break;
762 		ret = io_unregister_pbuf_ring(ctx, arg);
763 		break;
764 	case IORING_REGISTER_SYNC_CANCEL:
765 		ret = -EINVAL;
766 		if (!arg || nr_args != 1)
767 			break;
768 		ret = io_sync_cancel(ctx, arg);
769 		break;
770 	case IORING_REGISTER_FILE_ALLOC_RANGE:
771 		ret = -EINVAL;
772 		if (!arg || nr_args)
773 			break;
774 		ret = io_register_file_alloc_range(ctx, arg);
775 		break;
776 	case IORING_REGISTER_PBUF_STATUS:
777 		ret = -EINVAL;
778 		if (!arg || nr_args != 1)
779 			break;
780 		ret = io_register_pbuf_status(ctx, arg);
781 		break;
782 	case IORING_REGISTER_NAPI:
783 		ret = -EINVAL;
784 		if (!arg || nr_args != 1)
785 			break;
786 		ret = io_register_napi(ctx, arg);
787 		break;
788 	case IORING_UNREGISTER_NAPI:
789 		ret = -EINVAL;
790 		if (nr_args != 1)
791 			break;
792 		ret = io_unregister_napi(ctx, arg);
793 		break;
794 	case IORING_REGISTER_CLOCK:
795 		ret = -EINVAL;
796 		if (!arg || nr_args)
797 			break;
798 		ret = io_register_clock(ctx, arg);
799 		break;
800 	case IORING_REGISTER_CLONE_BUFFERS:
801 		ret = -EINVAL;
802 		if (!arg || nr_args != 1)
803 			break;
804 		ret = io_register_clone_buffers(ctx, arg);
805 		break;
806 	case IORING_REGISTER_RESIZE_RINGS:
807 		ret = -EINVAL;
808 		if (!arg || nr_args != 1)
809 			break;
810 		ret = io_register_resize_rings(ctx, arg);
811 		break;
812 	case IORING_REGISTER_MEM_REGION:
813 		ret = -EINVAL;
814 		if (!arg || nr_args != 1)
815 			break;
816 		ret = io_register_mem_region(ctx, arg);
817 		break;
818 	default:
819 		ret = -EINVAL;
820 		break;
821 	}
822 
823 	return ret;
824 }
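
/*
 * A sketch of the most common trip through the dispatcher above: registering
 * a fixed buffer table with IORING_REGISTER_BUFFERS, where arg is an array of
 * struct iovec and nr_args its length. Assumes <sys/uio.h> plus the uapi
 * header; 'ring_fd' and 'buf' are hypothetical.
 *
 *	struct iovec iov = {
 *		.iov_base = buf,
 *		.iov_len = 4096,
 *	};
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_BUFFERS,
 *		&iov, 1);
 *	// ... issue IORING_OP_READ_FIXED/WRITE_FIXED against buffer index 0 ...
 *	syscall(__NR_io_uring_register, ring_fd, IORING_UNREGISTER_BUFFERS,
 *		NULL, 0);
 */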
825 
826 /*
827  * Given an 'fd' value, return the ctx associated with it. If 'registered' is
828  * true, then the registered index is used. Otherwise, the normal fd table.
829  * Caller must call fput() on the returned file, unless it's an ERR_PTR.
830  */
831 struct file *io_uring_register_get_file(unsigned int fd, bool registered)
832 {
833 	struct file *file;
834 
835 	if (registered) {
836 		/*
837 		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, so we
838 		 * need only dereference our task-private array to find it.
839 		 */
840 		struct io_uring_task *tctx = current->io_uring;
841 
842 		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
843 			return ERR_PTR(-EINVAL);
844 		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
845 		file = tctx->registered_rings[fd];
846 	} else {
847 		file = fget(fd);
848 	}
849 
850 	if (unlikely(!file))
851 		return ERR_PTR(-EBADF);
852 	if (io_is_uring_fops(file))
853 		return file;
854 	fput(file);
855 	return ERR_PTR(-EOPNOTSUPP);
856 }
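
/*
 * A sketch of how the registered-ring lookup above is exercised: once a ring
 * fd has been registered via IORING_REGISTER_RING_FDS, later calls can pass
 * the registered index instead of a real fd by OR-ing
 * IORING_REGISTER_USE_REGISTERED_RING into the opcode (see the syscall entry
 * below). 'reg_idx' is a hypothetical index obtained from that registration.
 *
 *	syscall(__NR_io_uring_register, reg_idx,
 *		IORING_UNREGISTER_BUFFERS | IORING_REGISTER_USE_REGISTERED_RING,
 *		NULL, 0);
 */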
857 
858 /*
859  * "blind" registration opcodes are ones where there's no ring given, and
860  * hence the source fd must be -1.
861  */
862 static int io_uring_register_blind(unsigned int opcode, void __user *arg,
863 				   unsigned int nr_args)
864 {
865 	switch (opcode) {
866 	case IORING_REGISTER_SEND_MSG_RING: {
867 		struct io_uring_sqe sqe;
868 
869 		if (!arg || nr_args != 1)
870 			return -EINVAL;
871 		if (copy_from_user(&sqe, arg, sizeof(sqe)))
872 			return -EFAULT;
873 		/* no flags supported */
874 		if (sqe.flags)
875 			return -EINVAL;
876 		if (sqe.opcode == IORING_OP_MSG_RING)
877 			return io_uring_sync_msg_ring(&sqe);
878 		}
879 	}
880 
881 	return -EINVAL;
882 }
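
/*
 * A sketch of the blind path above: with fd == -1 and
 * IORING_REGISTER_SEND_MSG_RING, the argument is a single io_uring_sqe
 * describing an IORING_OP_MSG_RING request, and sqe.flags must be zero. The
 * full field encoding follows the normal MSG_RING prep; only the parts
 * checked here are spelled out, and 'target_ring_fd' is hypothetical.
 *
 *	struct io_uring_sqe sqe = {
 *		.opcode	= IORING_OP_MSG_RING,
 *		.fd	= target_ring_fd,
 *	};
 *
 *	syscall(__NR_io_uring_register, -1, IORING_REGISTER_SEND_MSG_RING,
 *		&sqe, 1);
 */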
883 
884 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
885 		void __user *, arg, unsigned int, nr_args)
886 {
887 	struct io_ring_ctx *ctx;
888 	long ret = -EBADF;
889 	struct file *file;
890 	bool use_registered_ring;
891 
892 	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
893 	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;
894 
895 	if (opcode >= IORING_REGISTER_LAST)
896 		return -EINVAL;
897 
898 	if (fd == -1)
899 		return io_uring_register_blind(opcode, arg, nr_args);
900 
901 	file = io_uring_register_get_file(fd, use_registered_ring);
902 	if (IS_ERR(file))
903 		return PTR_ERR(file);
904 	ctx = file->private_data;
905 
906 	mutex_lock(&ctx->uring_lock);
907 	ret = __io_uring_register(ctx, opcode, arg, nr_args);
908 
909 	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
910 				ctx->buf_table.nr, ret);
911 	mutex_unlock(&ctx->uring_lock);
912 	if (!use_registered_ring)
913 		fput(file);
914 	return ret;
915 }
916