xref: /linux/io_uring/register.c (revision 9c9ce355b1013a7ef37c06007cb8d714eaf4c303)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Code related to the io_uring_register() syscall
4  *
5  * Copyright (C) 2023 Jens Axboe
6  */
7 #include <linux/kernel.h>
8 #include <linux/errno.h>
9 #include <linux/syscalls.h>
10 #include <linux/refcount.h>
11 #include <linux/bits.h>
12 #include <linux/fs.h>
13 #include <linux/file.h>
14 #include <linux/slab.h>
15 #include <linux/uaccess.h>
16 #include <linux/nospec.h>
17 #include <linux/compat.h>
18 #include <linux/io_uring.h>
19 #include <linux/io_uring_types.h>
20 
21 #include "io_uring.h"
22 #include "opdef.h"
23 #include "tctx.h"
24 #include "rsrc.h"
25 #include "sqpoll.h"
26 #include "register.h"
27 #include "cancel.h"
28 #include "kbuf.h"
29 #include "napi.h"
30 #include "eventfd.h"
31 #include "msg_ring.h"
32 #include "memmap.h"
33 
34 #define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
35 				 IORING_REGISTER_LAST + IORING_OP_LAST)
36 
37 static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
38 			   unsigned nr_args)
39 {
40 	struct io_uring_probe *p;
41 	size_t size;
42 	int i, ret;
43 
44 	if (nr_args > IORING_OP_LAST)
45 		nr_args = IORING_OP_LAST;
46 
47 	size = struct_size(p, ops, nr_args);
48 	p = kzalloc(size, GFP_KERNEL);
49 	if (!p)
50 		return -ENOMEM;
51 
52 	ret = -EFAULT;
53 	if (copy_from_user(p, arg, size))
54 		goto out;
55 	ret = -EINVAL;
56 	if (memchr_inv(p, 0, size))
57 		goto out;
58 
59 	p->last_op = IORING_OP_LAST - 1;
60 
61 	for (i = 0; i < nr_args; i++) {
62 		p->ops[i].op = i;
63 		if (io_uring_op_supported(i))
64 			p->ops[i].flags = IO_URING_OP_SUPPORTED;
65 	}
66 	p->ops_len = i;
67 
68 	ret = 0;
69 	if (copy_to_user(arg, p, size))
70 		ret = -EFAULT;
71 out:
72 	kfree(p);
73 	return ret;
74 }
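
/*
 * Illustrative userspace sketch, not part of the kernel build: querying
 * opcode support via IORING_REGISTER_PROBE. io_uring_register() below
 * stands for a thin wrapper around the io_uring_register(2) syscall
 * (ring_fd is an assumed, already set-up ring); liburing ships an
 * equivalent io_uring_get_probe() helper.
 *
 *	struct io_uring_probe *p;
 *	bool have_readv = false;
 *
 *	p = calloc(1, sizeof(*p) + 256 * sizeof(struct io_uring_probe_op));
 *	if (p && !io_uring_register(ring_fd, IORING_REGISTER_PROBE, p, 256))
 *		have_readv = p->ops[IORING_OP_READV].flags & IO_URING_OP_SUPPORTED;
 *	free(p);
 */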
75 
76 int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
77 {
78 	const struct cred *creds;
79 
80 	creds = xa_erase(&ctx->personalities, id);
81 	if (creds) {
82 		put_cred(creds);
83 		return 0;
84 	}
85 
86 	return -EINVAL;
87 }
88 
90 static int io_register_personality(struct io_ring_ctx *ctx)
91 {
92 	const struct cred *creds;
93 	u32 id;
94 	int ret;
95 
96 	creds = get_current_cred();
97 
98 	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
99 			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
100 	if (ret < 0) {
101 		put_cred(creds);
102 		return ret;
103 	}
104 	return id;
105 }
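
/*
 * Illustrative userspace sketch, not part of the kernel build: a
 * registered personality id is attached to individual requests via
 * sqe->personality. io_uring_register() stands for a raw
 * io_uring_register(2) wrapper; ring_fd and sqe are assumed.
 *
 *	int id = io_uring_register(ring_fd, IORING_REGISTER_PERSONALITY, NULL, 0);
 *
 *	if (id >= 0) {
 *		sqe->personality = id;
 *		...
 *		io_uring_register(ring_fd, IORING_UNREGISTER_PERSONALITY, NULL, id);
 *	}
 */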
106 
107 static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
108 					   void __user *arg, unsigned int nr_args)
109 {
110 	struct io_uring_restriction *res;
111 	size_t size;
112 	int i, ret;
113 
114 	/* Restrictions allowed only if rings started disabled */
115 	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
116 		return -EBADFD;
117 
118 	/* We allow only a single restrictions registration */
119 	if (ctx->restrictions.registered)
120 		return -EBUSY;
121 
122 	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
123 		return -EINVAL;
124 
125 	size = array_size(nr_args, sizeof(*res));
126 	if (size == SIZE_MAX)
127 		return -EOVERFLOW;
128 
129 	res = memdup_user(arg, size);
130 	if (IS_ERR(res))
131 		return PTR_ERR(res);
132 
133 	ret = 0;
134 
135 	for (i = 0; i < nr_args; i++) {
136 		switch (res[i].opcode) {
137 		case IORING_RESTRICTION_REGISTER_OP:
138 			if (res[i].register_op >= IORING_REGISTER_LAST) {
139 				ret = -EINVAL;
140 				goto out;
141 			}
142 
143 			__set_bit(res[i].register_op,
144 				  ctx->restrictions.register_op);
145 			break;
146 		case IORING_RESTRICTION_SQE_OP:
147 			if (res[i].sqe_op >= IORING_OP_LAST) {
148 				ret = -EINVAL;
149 				goto out;
150 			}
151 
152 			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
153 			break;
154 		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
155 			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
156 			break;
157 		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
158 			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
159 			break;
160 		default:
161 			ret = -EINVAL;
162 			goto out;
163 		}
164 	}
165 
166 out:
167 	/* Reset all restrictions if an error happened */
168 	if (ret != 0)
169 		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
170 	else
171 		ctx->restrictions.registered = true;
172 
173 	kfree(res);
174 	return ret;
175 }
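
/*
 * Illustrative userspace sketch, not part of the kernel build:
 * restricting a ring created with IORING_SETUP_R_DISABLED so that only
 * NOP submissions and ENABLE_RINGS registrations are permitted once it
 * is enabled. io_uring_register() stands for a raw io_uring_register(2)
 * wrapper; ring_fd is assumed.
 *
 *	struct io_uring_restriction res[] = {
 *		{ .opcode = IORING_RESTRICTION_REGISTER_OP,
 *		  .register_op = IORING_REGISTER_ENABLE_RINGS },
 *		{ .opcode = IORING_RESTRICTION_SQE_OP,
 *		  .sqe_op = IORING_OP_NOP },
 *	};
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_RESTRICTIONS, res, 2);
 *	io_uring_register(ring_fd, IORING_REGISTER_ENABLE_RINGS, NULL, 0);
 */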
176 
177 static int io_register_enable_rings(struct io_ring_ctx *ctx)
178 {
179 	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
180 		return -EBADFD;
181 
182 	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
183 		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
184 		/*
185 		 * Lazy activation attempts would fail if the ring was polled
186 		 * before submitter_task is set.
187 		 */
188 		if (wq_has_sleeper(&ctx->poll_wq))
189 			io_activate_pollwq(ctx);
190 	}
191 
192 	if (ctx->restrictions.registered)
193 		ctx->restricted = 1;
194 
195 	ctx->flags &= ~IORING_SETUP_R_DISABLED;
196 	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
197 		wake_up(&ctx->sq_data->wait);
198 	return 0;
199 }
200 
201 static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
202 					 cpumask_var_t new_mask)
203 {
204 	int ret;
205 
206 	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
207 		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
208 	} else {
209 		mutex_unlock(&ctx->uring_lock);
210 		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
211 		mutex_lock(&ctx->uring_lock);
212 	}
213 
214 	return ret;
215 }
216 
217 static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
218 				       void __user *arg, unsigned len)
219 {
220 	cpumask_var_t new_mask;
221 	int ret;
222 
223 	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
224 		return -ENOMEM;
225 
226 	cpumask_clear(new_mask);
227 	if (len > cpumask_size())
228 		len = cpumask_size();
229 
230 #ifdef CONFIG_COMPAT
231 	if (in_compat_syscall())
232 		ret = compat_get_bitmap(cpumask_bits(new_mask),
233 					(const compat_ulong_t __user *)arg,
234 					len * 8 /* CHAR_BIT */);
235 	else
236 #endif
237 		ret = copy_from_user(new_mask, arg, len);
238 
239 	if (ret) {
240 		free_cpumask_var(new_mask);
241 		return -EFAULT;
242 	}
243 
244 	ret = __io_register_iowq_aff(ctx, new_mask);
245 	free_cpumask_var(new_mask);
246 	return ret;
247 }
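
/*
 * Illustrative userspace sketch, not part of the kernel build: pinning
 * this task's io-wq workers to CPUs 0 and 1. nr_args is the byte length
 * of the mask, matching the 'len' handling above. io_uring_register()
 * stands for a raw io_uring_register(2) wrapper; ring_fd is assumed.
 *
 *	cpu_set_t mask;
 *
 *	CPU_ZERO(&mask);
 *	CPU_SET(0, &mask);
 *	CPU_SET(1, &mask);
 *	io_uring_register(ring_fd, IORING_REGISTER_IOWQ_AFF, &mask, sizeof(mask));
 */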
248 
249 static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
250 {
251 	return __io_register_iowq_aff(ctx, NULL);
252 }
253 
254 static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
255 					       void __user *arg)
256 	__must_hold(&ctx->uring_lock)
257 {
258 	struct io_tctx_node *node;
259 	struct io_uring_task *tctx = NULL;
260 	struct io_sq_data *sqd = NULL;
261 	__u32 new_count[2];
262 	int i, ret;
263 
264 	if (copy_from_user(new_count, arg, sizeof(new_count)))
265 		return -EFAULT;
266 	for (i = 0; i < ARRAY_SIZE(new_count); i++)
267 		if (new_count[i] > INT_MAX)
268 			return -EINVAL;
269 
270 	if (ctx->flags & IORING_SETUP_SQPOLL) {
271 		sqd = ctx->sq_data;
272 		if (sqd) {
273 			/*
274 			 * Observe the correct sqd->lock -> ctx->uring_lock
275 			 * ordering. Fine to drop uring_lock here, we hold
276 			 * a ref to the ctx.
277 			 */
278 			refcount_inc(&sqd->refs);
279 			mutex_unlock(&ctx->uring_lock);
280 			mutex_lock(&sqd->lock);
281 			mutex_lock(&ctx->uring_lock);
282 			if (sqd->thread)
283 				tctx = sqd->thread->io_uring;
284 		}
285 	} else {
286 		tctx = current->io_uring;
287 	}
288 
289 	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
290 
291 	for (i = 0; i < ARRAY_SIZE(new_count); i++)
292 		if (new_count[i])
293 			ctx->iowq_limits[i] = new_count[i];
294 	ctx->iowq_limits_set = true;
295 
296 	if (tctx && tctx->io_wq) {
297 		ret = io_wq_max_workers(tctx->io_wq, new_count);
298 		if (ret)
299 			goto err;
300 	} else {
301 		memset(new_count, 0, sizeof(new_count));
302 	}
303 
304 	if (sqd) {
305 		mutex_unlock(&ctx->uring_lock);
306 		mutex_unlock(&sqd->lock);
307 		io_put_sq_data(sqd);
308 		mutex_lock(&ctx->uring_lock);
309 	}
310 
311 	if (copy_to_user(arg, new_count, sizeof(new_count)))
312 		return -EFAULT;
313 
314 	/* that's it for SQPOLL, only the SQPOLL task creates requests */
315 	if (sqd)
316 		return 0;
317 
318 	/* now propagate the restriction to all registered users */
319 	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
320 		tctx = node->task->io_uring;
321 		if (WARN_ON_ONCE(!tctx->io_wq))
322 			continue;
323 
324 		for (i = 0; i < ARRAY_SIZE(new_count); i++)
325 			new_count[i] = ctx->iowq_limits[i];
326 		/* ignore errors, it always returns zero anyway */
327 		(void)io_wq_max_workers(tctx->io_wq, new_count);
328 	}
329 	return 0;
330 err:
331 	if (sqd) {
332 		mutex_unlock(&ctx->uring_lock);
333 		mutex_unlock(&sqd->lock);
334 		io_put_sq_data(sqd);
335 		mutex_lock(&ctx->uring_lock);
336 	}
337 	return ret;
338 }
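
/*
 * Illustrative userspace sketch, not part of the kernel build: capping
 * io-wq workers at 4 bounded and 8 unbounded (element 0 is the bounded
 * limit, element 1 the unbounded one). A zero entry leaves that limit
 * unchanged, and the previous limits are copied back into the array, as
 * implemented above. io_uring_register() stands for a raw
 * io_uring_register(2) wrapper; ring_fd is assumed.
 *
 *	unsigned int counts[2] = { 4, 8 };
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_IOWQ_MAX_WORKERS, counts, 2);
 */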
339 
340 static int io_register_clock(struct io_ring_ctx *ctx,
341 			     struct io_uring_clock_register __user *arg)
342 {
343 	struct io_uring_clock_register reg;
344 
345 	if (copy_from_user(&reg, arg, sizeof(reg)))
346 		return -EFAULT;
347 	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
348 		return -EINVAL;
349 
350 	switch (reg.clockid) {
351 	case CLOCK_MONOTONIC:
352 		ctx->clock_offset = 0;
353 		break;
354 	case CLOCK_BOOTTIME:
355 		ctx->clock_offset = TK_OFFS_BOOT;
356 		break;
357 	default:
358 		return -EINVAL;
359 	}
360 
361 	ctx->clockid = reg.clockid;
362 	return 0;
363 }
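
/*
 * Illustrative userspace sketch, not part of the kernel build: switching
 * CQ wait timeouts to CLOCK_BOOTTIME so suspended time is accounted for.
 * nr_args must be 0 for this opcode. io_uring_register() stands for a
 * raw io_uring_register(2) wrapper; ring_fd is assumed.
 *
 *	struct io_uring_clock_register reg = { .clockid = CLOCK_BOOTTIME };
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_CLOCK, &reg, 0);
 */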
364 
365 /*
366  * State to maintain until we can swap. Both new and old state, used for
367  * either mapping or freeing.
368  */
369 struct io_ring_ctx_rings {
370 	unsigned short n_ring_pages;
371 	unsigned short n_sqe_pages;
372 	struct page **ring_pages;
373 	struct page **sqe_pages;
374 	struct io_uring_sqe *sq_sqes;
375 	struct io_rings *rings;
376 };
377 
378 static void io_register_free_rings(struct io_uring_params *p,
379 				   struct io_ring_ctx_rings *r)
380 {
381 	if (!(p->flags & IORING_SETUP_NO_MMAP)) {
382 		io_pages_unmap(r->rings, &r->ring_pages, &r->n_ring_pages,
383 				true);
384 		io_pages_unmap(r->sq_sqes, &r->sqe_pages, &r->n_sqe_pages,
385 				true);
386 	} else {
387 		io_pages_free(&r->ring_pages, r->n_ring_pages);
388 		io_pages_free(&r->sqe_pages, r->n_sqe_pages);
389 		vunmap(r->rings);
390 		vunmap(r->sq_sqes);
391 	}
392 }
393 
394 #define swap_old(ctx, o, n, field)		\
395 	do {					\
396 		(o).field = (ctx)->field;	\
397 		(ctx)->field = (n).field;	\
398 	} while (0)
399 
400 #define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
401 #define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
402 			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP)
403 
404 static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
405 {
406 	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
407 	size_t size, sq_array_offset;
408 	struct io_uring_params p;
409 	unsigned i, tail;
410 	void *ptr;
411 	int ret;
412 
413 	/* for single issuer, must be owner resizing */
414 	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER &&
415 	    current != ctx->submitter_task)
416 		return -EEXIST;
417 	/* limited to DEFER_TASKRUN for now */
418 	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
419 		return -EINVAL;
420 	if (copy_from_user(&p, arg, sizeof(p)))
421 		return -EFAULT;
422 	if (p.flags & ~RESIZE_FLAGS)
423 		return -EINVAL;
424 
425 	/* properties that are always inherited */
426 	p.flags |= (ctx->flags & COPY_FLAGS);
427 
428 	ret = io_uring_fill_params(p.sq_entries, &p);
429 	if (unlikely(ret))
430 		return ret;
431 
432 	/* nothing to do, but copy params back */
433 	if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) {
434 		if (copy_to_user(arg, &p, sizeof(p)))
435 			return -EFAULT;
436 		return 0;
437 	}
438 
439 	size = rings_size(p.flags, p.sq_entries, p.cq_entries,
440 				&sq_array_offset);
441 	if (size == SIZE_MAX)
442 		return -EOVERFLOW;
443 
444 	if (!(p.flags & IORING_SETUP_NO_MMAP))
445 		n.rings = io_pages_map(&n.ring_pages, &n.n_ring_pages, size);
446 	else
447 		n.rings = __io_uaddr_map(&n.ring_pages, &n.n_ring_pages,
448 						p.cq_off.user_addr, size);
449 	if (IS_ERR(n.rings))
450 		return PTR_ERR(n.rings);
451 
452 	n.rings->sq_ring_mask = p.sq_entries - 1;
453 	n.rings->cq_ring_mask = p.cq_entries - 1;
454 	n.rings->sq_ring_entries = p.sq_entries;
455 	n.rings->cq_ring_entries = p.cq_entries;
456 
457 	if (copy_to_user(arg, &p, sizeof(p))) {
458 		io_register_free_rings(&p, &n);
459 		return -EFAULT;
460 	}
461 
462 	if (p.flags & IORING_SETUP_SQE128)
463 		size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries);
464 	else
465 		size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
466 	if (size == SIZE_MAX) {
467 		io_register_free_rings(&p, &n);
468 		return -EOVERFLOW;
469 	}
470 
471 	if (!(p.flags & IORING_SETUP_NO_MMAP))
472 		ptr = io_pages_map(&n.sqe_pages, &n.n_sqe_pages, size);
473 	else
474 		ptr = __io_uaddr_map(&n.sqe_pages, &n.n_sqe_pages,
475 					p.sq_off.user_addr,
476 					size);
477 	if (IS_ERR(ptr)) {
478 		io_register_free_rings(&p, &n);
479 		return PTR_ERR(ptr);
480 	}
481 
482 	/*
483 	 * If using SQPOLL, park the thread
484 	 */
485 	if (ctx->sq_data) {
486 		mutex_unlock(&ctx->uring_lock);
487 		io_sq_thread_park(ctx->sq_data);
488 		mutex_lock(&ctx->uring_lock);
489 	}
490 
491 	/*
492 	 * We'll do the swap. Grab the ctx->resize_lock, which will exclude
493 	 * any new mmaps on the ring fd. Clear out existing mappings to prevent
494 	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
495 	 * existing rings beyond this point will fail. Not that it could proceed
496 	 * at this point anyway, as the io_uring mmap side needs to grab the
497 	 * ctx->resize_lock as well. Likewise, hold the completion lock over the
498 	 * duration of the actual swap.
499 	 */
500 	mutex_lock(&ctx->resize_lock);
501 	spin_lock(&ctx->completion_lock);
502 	o.rings = ctx->rings;
503 	ctx->rings = NULL;
504 	o.sq_sqes = ctx->sq_sqes;
505 	ctx->sq_sqes = NULL;
506 
507 	/*
508 	 * Now copy SQ and CQ entries, if any. If either of the destination
509 	 * rings can't hold what is already there, then fail the operation.
510 	 */
511 	n.sq_sqes = ptr;
512 	tail = o.rings->sq.tail;
513 	if (tail - o.rings->sq.head > p.sq_entries)
514 		goto overflow;
515 	for (i = o.rings->sq.head; i < tail; i++) {
516 		unsigned src_head = i & (ctx->sq_entries - 1);
517 		unsigned dst_head = i & n.rings->sq_ring_mask;
518 
519 		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
520 	}
521 	n.rings->sq.head = o.rings->sq.head;
522 	n.rings->sq.tail = o.rings->sq.tail;
523 
524 	tail = o.rings->cq.tail;
525 	if (tail - o.rings->cq.head > p.cq_entries) {
526 overflow:
527 		/* restore old rings, and return -EOVERFLOW via cleanup path */
528 		ctx->rings = o.rings;
529 		ctx->sq_sqes = o.sq_sqes;
530 		to_free = &n;
531 		ret = -EOVERFLOW;
532 		goto out;
533 	}
534 	for (i = o.rings->cq.head; i < tail; i++) {
535 		unsigned src_head = i & (ctx->cq_entries - 1);
536 		unsigned dst_head = i & n.rings->cq_ring_mask;
537 
538 		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
539 	}
540 	n.rings->cq.head = o.rings->cq.head;
541 	n.rings->cq.tail = o.rings->cq.tail;
542 	/* invalidate cached cqe refill */
543 	ctx->cqe_cached = ctx->cqe_sentinel = NULL;
544 
545 	n.rings->sq_dropped = o.rings->sq_dropped;
546 	n.rings->sq_flags = o.rings->sq_flags;
547 	n.rings->cq_flags = o.rings->cq_flags;
548 	n.rings->cq_overflow = o.rings->cq_overflow;
549 
550 	/* all done, store old pointers and assign new ones */
551 	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
552 		ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset);
553 
554 	ctx->sq_entries = p.sq_entries;
555 	ctx->cq_entries = p.cq_entries;
556 
557 	ctx->rings = n.rings;
558 	ctx->sq_sqes = n.sq_sqes;
559 	swap_old(ctx, o, n, n_ring_pages);
560 	swap_old(ctx, o, n, n_sqe_pages);
561 	swap_old(ctx, o, n, ring_pages);
562 	swap_old(ctx, o, n, sqe_pages);
563 	to_free = &o;
564 	ret = 0;
565 out:
566 	spin_unlock(&ctx->completion_lock);
567 	mutex_unlock(&ctx->resize_lock);
568 	io_register_free_rings(&p, to_free);
569 
570 	if (ctx->sq_data)
571 		io_sq_thread_unpark(ctx->sq_data);
572 
573 	return ret;
574 }
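
/*
 * Illustrative userspace sketch, not part of the kernel build: resizing
 * a DEFER_TASKRUN ring to 256 SQ / 512 CQ entries. Only the RESIZE_FLAGS
 * may be passed in p.flags; the COPY_FLAGS properties are inherited from
 * the existing ring, as implemented above. io_uring_register() stands
 * for a raw io_uring_register(2) wrapper; ring_fd is assumed.
 *
 *	struct io_uring_params p = {
 *		.sq_entries	= 256,
 *		.cq_entries	= 512,
 *		.flags		= IORING_SETUP_CQSIZE,
 *	};
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_RESIZE_RINGS, &p, 1);
 */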
575 
576 static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
577 {
578 	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
579 	struct io_uring_mem_region_reg reg;
580 	struct io_uring_region_desc __user *rd_uptr;
581 	struct io_uring_region_desc rd;
582 	int ret;
583 
584 	if (io_region_is_set(&ctx->param_region))
585 		return -EBUSY;
586 	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
587 		return -EFAULT;
588 	rd_uptr = u64_to_user_ptr(reg.region_uptr);
589 	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
590 		return -EFAULT;
591 
592 	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
593 		return -EINVAL;
594 	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
595 		return -EINVAL;
596 
597 	/*
598 	 * This ensures there are no waiters. Waiters sleep without holding
599 	 * uring_lock and are hard to synchronise with, especially if we need
600 	 * to initialise the region.
601 	 */
602 	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
603 	    !(ctx->flags & IORING_SETUP_R_DISABLED))
604 		return -EINVAL;
605 
606 	ret = io_create_region(ctx, &ctx->param_region, &rd);
607 	if (ret)
608 		return ret;
609 	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
610 		io_free_region(ctx, &ctx->param_region);
611 		return -EFAULT;
612 	}
613 
614 	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
615 		ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region);
616 		ctx->cq_wait_size = rd.size;
617 	}
618 	return 0;
619 }
620 
621 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
622 			       void __user *arg, unsigned nr_args)
623 	__releases(ctx->uring_lock)
624 	__acquires(ctx->uring_lock)
625 {
626 	int ret;
627 
628 	/*
629 	 * We don't quiesce the refs for register anymore and so it can't be
630 	 * dying as we're holding a file ref here.
631 	 */
632 	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
633 		return -ENXIO;
634 
635 	if (ctx->submitter_task && ctx->submitter_task != current)
636 		return -EEXIST;
637 
638 	if (ctx->restricted) {
639 		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
640 		if (!test_bit(opcode, ctx->restrictions.register_op))
641 			return -EACCES;
642 	}
643 
644 	switch (opcode) {
645 	case IORING_REGISTER_BUFFERS:
646 		ret = -EFAULT;
647 		if (!arg)
648 			break;
649 		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
650 		break;
651 	case IORING_UNREGISTER_BUFFERS:
652 		ret = -EINVAL;
653 		if (arg || nr_args)
654 			break;
655 		ret = io_sqe_buffers_unregister(ctx);
656 		break;
657 	case IORING_REGISTER_FILES:
658 		ret = -EFAULT;
659 		if (!arg)
660 			break;
661 		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
662 		break;
663 	case IORING_UNREGISTER_FILES:
664 		ret = -EINVAL;
665 		if (arg || nr_args)
666 			break;
667 		ret = io_sqe_files_unregister(ctx);
668 		break;
669 	case IORING_REGISTER_FILES_UPDATE:
670 		ret = io_register_files_update(ctx, arg, nr_args);
671 		break;
672 	case IORING_REGISTER_EVENTFD:
673 		ret = -EINVAL;
674 		if (nr_args != 1)
675 			break;
676 		ret = io_eventfd_register(ctx, arg, 0);
677 		break;
678 	case IORING_REGISTER_EVENTFD_ASYNC:
679 		ret = -EINVAL;
680 		if (nr_args != 1)
681 			break;
682 		ret = io_eventfd_register(ctx, arg, 1);
683 		break;
684 	case IORING_UNREGISTER_EVENTFD:
685 		ret = -EINVAL;
686 		if (arg || nr_args)
687 			break;
688 		ret = io_eventfd_unregister(ctx);
689 		break;
690 	case IORING_REGISTER_PROBE:
691 		ret = -EINVAL;
692 		if (!arg || nr_args > 256)
693 			break;
694 		ret = io_probe(ctx, arg, nr_args);
695 		break;
696 	case IORING_REGISTER_PERSONALITY:
697 		ret = -EINVAL;
698 		if (arg || nr_args)
699 			break;
700 		ret = io_register_personality(ctx);
701 		break;
702 	case IORING_UNREGISTER_PERSONALITY:
703 		ret = -EINVAL;
704 		if (arg)
705 			break;
706 		ret = io_unregister_personality(ctx, nr_args);
707 		break;
708 	case IORING_REGISTER_ENABLE_RINGS:
709 		ret = -EINVAL;
710 		if (arg || nr_args)
711 			break;
712 		ret = io_register_enable_rings(ctx);
713 		break;
714 	case IORING_REGISTER_RESTRICTIONS:
715 		ret = io_register_restrictions(ctx, arg, nr_args);
716 		break;
717 	case IORING_REGISTER_FILES2:
718 		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
719 		break;
720 	case IORING_REGISTER_FILES_UPDATE2:
721 		ret = io_register_rsrc_update(ctx, arg, nr_args,
722 					      IORING_RSRC_FILE);
723 		break;
724 	case IORING_REGISTER_BUFFERS2:
725 		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
726 		break;
727 	case IORING_REGISTER_BUFFERS_UPDATE:
728 		ret = io_register_rsrc_update(ctx, arg, nr_args,
729 					      IORING_RSRC_BUFFER);
730 		break;
731 	case IORING_REGISTER_IOWQ_AFF:
732 		ret = -EINVAL;
733 		if (!arg || !nr_args)
734 			break;
735 		ret = io_register_iowq_aff(ctx, arg, nr_args);
736 		break;
737 	case IORING_UNREGISTER_IOWQ_AFF:
738 		ret = -EINVAL;
739 		if (arg || nr_args)
740 			break;
741 		ret = io_unregister_iowq_aff(ctx);
742 		break;
743 	case IORING_REGISTER_IOWQ_MAX_WORKERS:
744 		ret = -EINVAL;
745 		if (!arg || nr_args != 2)
746 			break;
747 		ret = io_register_iowq_max_workers(ctx, arg);
748 		break;
749 	case IORING_REGISTER_RING_FDS:
750 		ret = io_ringfd_register(ctx, arg, nr_args);
751 		break;
752 	case IORING_UNREGISTER_RING_FDS:
753 		ret = io_ringfd_unregister(ctx, arg, nr_args);
754 		break;
755 	case IORING_REGISTER_PBUF_RING:
756 		ret = -EINVAL;
757 		if (!arg || nr_args != 1)
758 			break;
759 		ret = io_register_pbuf_ring(ctx, arg);
760 		break;
761 	case IORING_UNREGISTER_PBUF_RING:
762 		ret = -EINVAL;
763 		if (!arg || nr_args != 1)
764 			break;
765 		ret = io_unregister_pbuf_ring(ctx, arg);
766 		break;
767 	case IORING_REGISTER_SYNC_CANCEL:
768 		ret = -EINVAL;
769 		if (!arg || nr_args != 1)
770 			break;
771 		ret = io_sync_cancel(ctx, arg);
772 		break;
773 	case IORING_REGISTER_FILE_ALLOC_RANGE:
774 		ret = -EINVAL;
775 		if (!arg || nr_args)
776 			break;
777 		ret = io_register_file_alloc_range(ctx, arg);
778 		break;
779 	case IORING_REGISTER_PBUF_STATUS:
780 		ret = -EINVAL;
781 		if (!arg || nr_args != 1)
782 			break;
783 		ret = io_register_pbuf_status(ctx, arg);
784 		break;
785 	case IORING_REGISTER_NAPI:
786 		ret = -EINVAL;
787 		if (!arg || nr_args != 1)
788 			break;
789 		ret = io_register_napi(ctx, arg);
790 		break;
791 	case IORING_UNREGISTER_NAPI:
792 		ret = -EINVAL;
793 		if (nr_args != 1)
794 			break;
795 		ret = io_unregister_napi(ctx, arg);
796 		break;
797 	case IORING_REGISTER_CLOCK:
798 		ret = -EINVAL;
799 		if (!arg || nr_args)
800 			break;
801 		ret = io_register_clock(ctx, arg);
802 		break;
803 	case IORING_REGISTER_CLONE_BUFFERS:
804 		ret = -EINVAL;
805 		if (!arg || nr_args != 1)
806 			break;
807 		ret = io_register_clone_buffers(ctx, arg);
808 		break;
809 	case IORING_REGISTER_RESIZE_RINGS:
810 		ret = -EINVAL;
811 		if (!arg || nr_args != 1)
812 			break;
813 		ret = io_register_resize_rings(ctx, arg);
814 		break;
815 	case IORING_REGISTER_MEM_REGION:
816 		ret = -EINVAL;
817 		if (!arg || nr_args != 1)
818 			break;
819 		ret = io_register_mem_region(ctx, arg);
820 		break;
821 	default:
822 		ret = -EINVAL;
823 		break;
824 	}
825 
826 	return ret;
827 }
828 
829 /*
830  * Given an 'fd' value, return the ctx associated with it. If 'registered' is
831  * true, then the registered index is used. Otherwise, the normal fd table.
832  * Caller must call fput() on the returned file, unless it's an ERR_PTR.
833  */
834 struct file *io_uring_register_get_file(unsigned int fd, bool registered)
835 {
836 	struct file *file;
837 
838 	if (registered) {
839 		/*
840 		 * Ring fd has been registered via IORING_REGISTER_RING_FDS; we
841 		 * need only dereference our task-private array to find it.
842 		 */
843 		struct io_uring_task *tctx = current->io_uring;
844 
845 		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
846 			return ERR_PTR(-EINVAL);
847 		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
848 		file = tctx->registered_rings[fd];
849 	} else {
850 		file = fget(fd);
851 	}
852 
853 	if (unlikely(!file))
854 		return ERR_PTR(-EBADF);
855 	if (io_is_uring_fops(file))
856 		return file;
857 	fput(file);
858 	return ERR_PTR(-EOPNOTSUPP);
859 }
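
/*
 * Illustrative userspace sketch, not part of the kernel build:
 * registering the ring fd itself and then addressing the ring by its
 * registered index. offset == -1U asks the kernel to pick a free slot,
 * which is written back on return. io_uring_register() stands for a raw
 * io_uring_register(2) wrapper; ring_fd is assumed.
 *
 *	struct io_uring_rsrc_update upd = {
 *		.offset	= -1U,
 *		.data	= ring_fd,
 *	};
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_RING_FDS, &upd, 1);
 *	io_uring_register(upd.offset, IORING_REGISTER_PERSONALITY |
 *			  IORING_REGISTER_USE_REGISTERED_RING, NULL, 0);
 */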
860 
861 /*
862  * "blind" registration opcodes are ones where there's no ring given, and
863  * hence the source fd must be -1.
864  */
865 static int io_uring_register_blind(unsigned int opcode, void __user *arg,
866 				   unsigned int nr_args)
867 {
868 	switch (opcode) {
869 	case IORING_REGISTER_SEND_MSG_RING: {
870 		struct io_uring_sqe sqe;
871 
872 		if (!arg || nr_args != 1)
873 			return -EINVAL;
874 		if (copy_from_user(&sqe, arg, sizeof(sqe)))
875 			return -EFAULT;
876 		/* no flags supported */
877 		if (sqe.flags)
878 			return -EINVAL;
879 		if (sqe.opcode == IORING_OP_MSG_RING)
880 			return io_uring_sync_msg_ring(&sqe);
881 		}
882 	}
883 
884 	return -EINVAL;
885 }
886 
887 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
888 		void __user *, arg, unsigned int, nr_args)
889 {
890 	struct io_ring_ctx *ctx;
891 	long ret = -EBADF;
892 	struct file *file;
893 	bool use_registered_ring;
894 
895 	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
896 	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;
897 
898 	if (opcode >= IORING_REGISTER_LAST)
899 		return -EINVAL;
900 
901 	if (fd == -1)
902 		return io_uring_register_blind(opcode, arg, nr_args);
903 
904 	file = io_uring_register_get_file(fd, use_registered_ring);
905 	if (IS_ERR(file))
906 		return PTR_ERR(file);
907 	ctx = file->private_data;
908 
909 	mutex_lock(&ctx->uring_lock);
910 	ret = __io_uring_register(ctx, opcode, arg, nr_args);
911 
912 	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
913 				ctx->buf_table.nr, ret);
914 	mutex_unlock(&ctx->uring_lock);
915 	if (!use_registered_ring)
916 		fput(file);
917 	return ret;
918 }
919