// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "filetable.h"
#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"
#include "msg_ring.h"
#include "memmap.h"
#include "zcrx.h"
#include "query.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

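/*
 * IORING_REGISTER_PROBE: report which opcodes this kernel supports. The
 * user-supplied probe structure (with room for 'nr_args' ops entries) must
 * be zeroed; it is filled with the highest supported opcode and a
 * per-opcode IO_URING_OP_SUPPORTED flag, then copied back to userspace.
 */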
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	size = struct_size(p, ops, nr_args);
	p = memdup_user(arg, size);
	if (IS_ERR(p))
		return PTR_ERR(p);
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (io_uring_op_supported(i))
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

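/*
 * Drop the credentials registered under 'id' via IORING_REGISTER_PERSONALITY.
 * Returns -EINVAL if no such personality exists on this ring.
 */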
int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}

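/*
 * Register the current task's credentials with the ring. Returns a
 * personality id (allocated cyclically, capped at USHRT_MAX) that requests
 * can later reference through the SQE personality field.
 */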
static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

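/*
 * Parse a user array of io_uring_restriction entries into the in-kernel
 * restriction state: allowed register opcodes and SQE opcodes are tracked
 * in bitmaps, and SQE flags can be constrained via allowed/required masks.
 * Any unknown or out-of-range entry fails the whole operation.
 */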
static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
					struct io_restriction *restrictions)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = -EINVAL;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST)
				goto err;
			__set_bit(res[i].register_op, restrictions->register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST)
				goto err;
			__set_bit(res[i].sqe_op, restrictions->sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			restrictions->sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			restrictions->sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			goto err;
		}
	}

	ret = 0;

err:
	kfree(res);
	return ret;
}

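/* Register restrictions; only valid while the ring is disabled, and only once. */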
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	int ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;
	return ret;
}

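/*
 * Enable a ring that was created with IORING_SETUP_R_DISABLED. For
 * IORING_SETUP_SINGLE_ISSUER the enabling task becomes the submitter task,
 * any registered restrictions take effect, and a waiting SQPOLL thread is
 * woken up.
 */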
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if the ring was polled
		 * before submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

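/*
 * Apply an io-wq CPU affinity mask: for SQPOLL rings this targets the
 * SQPOLL thread's io-wq (dropping uring_lock to respect lock ordering),
 * otherwise the calling task's io-wq. A NULL mask clears the affinity.
 */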
static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

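/*
 * Get/set the maximum number of bounded and unbounded io-wq workers.
 * A zero entry leaves that limit unchanged, and the previous values are
 * copied back to userspace. For SQPOLL the limits apply to the SQPOLL
 * task's io-wq; otherwise they are stored on the ctx and propagated to
 * every task registered with this ring.
 */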
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			struct task_struct *tsk;

			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			tsk = sqpoll_task_locked(sqd);
			if (tsk)
				tctx = tsk->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}

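/*
 * IORING_REGISTER_CLOCK: select which clock CQ wait timeouts are measured
 * against. Only CLOCK_MONOTONIC and CLOCK_BOOTTIME are accepted, and the
 * reserved fields must be zero. Illustrative userspace call via the raw
 * syscall (nr_args must be 0):
 *
 *	struct io_uring_clock_register reg = { .clockid = CLOCK_BOOTTIME };
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_CLOCK,
 *		&reg, 0);
 */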
static int io_register_clock(struct io_ring_ctx *ctx,
			     struct io_uring_clock_register __user *arg)
{
	struct io_uring_clock_register reg;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;

	switch (reg.clockid) {
	case CLOCK_MONOTONIC:
		ctx->clock_offset = 0;
		break;
	case CLOCK_BOOTTIME:
		ctx->clock_offset = TK_OFFS_BOOT;
		break;
	default:
		return -EINVAL;
	}

	ctx->clockid = reg.clockid;
	return 0;
}

/*
 * State to maintain until we can swap. Both new and old state, used for
 * either mapping or freeing.
 */
struct io_ring_ctx_rings {
	struct io_rings *rings;
	struct io_uring_sqe *sq_sqes;

	struct io_mapped_region sq_region;
	struct io_mapped_region ring_region;
};

static void io_register_free_rings(struct io_ring_ctx *ctx,
				   struct io_uring_params *p,
				   struct io_ring_ctx_rings *r)
{
	io_free_region(ctx, &r->sq_region);
	io_free_region(ctx, &r->ring_region);
}

#define swap_old(ctx, o, n, field)		\
	do {					\
		(o).field = (ctx)->field;	\
		(ctx)->field = (n).field;	\
	} while (0)

#define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
#define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \
			 IORING_SETUP_CQE_MIXED)

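/*
 * Resize the SQ/CQ rings of an existing ring (currently DEFER_TASKRUN only).
 * New ring and SQE regions are allocated, pending SQ and CQ entries are
 * copied across under ctx->mmap_lock and the completion lock, and the old
 * regions are freed. Fails with -EOVERFLOW if the new rings cannot hold the
 * entries currently queued.
 */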
static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_region_desc rd;
	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
	size_t size, sq_array_offset;
	unsigned i, tail, old_head;
	struct io_uring_params p;
	int ret;

	/* limited to DEFER_TASKRUN for now */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		return -EINVAL;
	if (copy_from_user(&p, arg, sizeof(p)))
		return -EFAULT;
	if (p.flags & ~RESIZE_FLAGS)
		return -EINVAL;

	/* properties that are always inherited */
	p.flags |= (ctx->flags & COPY_FLAGS);

	ret = io_uring_fill_params(p.sq_entries, &p);
	if (unlikely(ret))
		return ret;

	/* nothing to do, but copy params back */
	if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) {
		if (copy_to_user(arg, &p, sizeof(p)))
			return -EFAULT;
		return 0;
	}

	size = rings_size(p.flags, p.sq_entries, p.cq_entries,
				&sq_array_offset);
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(size);
	if (p.flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p.cq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
	if (ret) {
		io_register_free_rings(ctx, &p, &n);
		return ret;
	}
	n.rings = io_region_get_ptr(&n.ring_region);

	/*
	 * At this point n.rings is shared with userspace, just like o.rings
	 * is as well. While we don't expect userspace to modify it while
	 * a resize is in progress, and it's most likely that userspace will
	 * shoot itself in the foot if it does, we can't always assume good
	 * intent... Use read/write once helpers from here on to indicate the
	 * shared nature of it.
	 */
	WRITE_ONCE(n.rings->sq_ring_mask, p.sq_entries - 1);
	WRITE_ONCE(n.rings->cq_ring_mask, p.cq_entries - 1);
	WRITE_ONCE(n.rings->sq_ring_entries, p.sq_entries);
	WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries);

	if (copy_to_user(arg, &p, sizeof(p))) {
		io_register_free_rings(ctx, &p, &n);
		return -EFAULT;
	}

	if (p.flags & IORING_SETUP_SQE128)
		size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries);
	else
		size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
	if (size == SIZE_MAX) {
		io_register_free_rings(ctx, &p, &n);
		return -EOVERFLOW;
	}

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(size);
	if (p.flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p.sq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region_mmap_safe(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
	if (ret) {
		io_register_free_rings(ctx, &p, &n);
		return ret;
	}
	n.sq_sqes = io_region_get_ptr(&n.sq_region);

	/*
	 * If using SQPOLL, park the thread
	 */
	if (ctx->sq_data) {
		mutex_unlock(&ctx->uring_lock);
		io_sq_thread_park(ctx->sq_data);
		mutex_lock(&ctx->uring_lock);
	}
	/*
	 * We'll do the swap. Grab the ctx->mmap_lock, which will exclude
	 * any new mmap's on the ring fd. Clear out existing mappings to prevent
	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
	 * existing rings beyond this point will fail. Not that it could proceed
	 * at this point anyway, as the io_uring mmap side needs to grab the
	 * ctx->mmap_lock as well. Likewise, hold the completion lock over the
	 * duration of the actual swap.
	 */
	mutex_lock(&ctx->mmap_lock);
	spin_lock(&ctx->completion_lock);
	o.rings = ctx->rings;
	ctx->rings = NULL;
	o.sq_sqes = ctx->sq_sqes;
	ctx->sq_sqes = NULL;

	/*
	 * Now copy SQ and CQ entries, if any. If either of the destination
	 * rings can't hold what is already there, then fail the operation.
	 */
	tail = READ_ONCE(o.rings->sq.tail);
	old_head = READ_ONCE(o.rings->sq.head);
	if (tail - old_head > p.sq_entries)
		goto overflow;
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->sq_entries - 1);
		unsigned dst_head = i & (p.sq_entries - 1);

		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
	}
	WRITE_ONCE(n.rings->sq.head, old_head);
	WRITE_ONCE(n.rings->sq.tail, tail);

	tail = READ_ONCE(o.rings->cq.tail);
	old_head = READ_ONCE(o.rings->cq.head);
	if (tail - old_head > p.cq_entries) {
overflow:
		/* restore old rings, and return -EOVERFLOW via cleanup path */
		ctx->rings = o.rings;
		ctx->sq_sqes = o.sq_sqes;
		to_free = &n;
		ret = -EOVERFLOW;
		goto out;
	}
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->cq_entries - 1);
		unsigned dst_head = i & (p.cq_entries - 1);

		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
	}
	WRITE_ONCE(n.rings->cq.head, old_head);
	WRITE_ONCE(n.rings->cq.tail, tail);
	/* invalidate cached cqe refill */
	ctx->cqe_cached = ctx->cqe_sentinel = NULL;

	WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped));
	atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags));
	WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags));
	WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));

	/* all done, store old pointers and assign new ones */
	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
		ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset);

	ctx->sq_entries = p.sq_entries;
	ctx->cq_entries = p.cq_entries;

	ctx->rings = n.rings;
	ctx->sq_sqes = n.sq_sqes;
	swap_old(ctx, o, n, ring_region);
	swap_old(ctx, o, n, sq_region);
	to_free = &o;
	ret = 0;
out:
	spin_unlock(&ctx->completion_lock);
	mutex_unlock(&ctx->mmap_lock);
	io_register_free_rings(ctx, &p, to_free);

	if (ctx->sq_data)
		io_sq_thread_unpark(ctx->sq_data);

	return ret;
}

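/*
 * Register a caller-provided memory region with the ring. Only one such
 * region may exist per ring; if IORING_MEM_REGION_REG_WAIT_ARG is set (only
 * allowed while the ring is still disabled), the region is also used for
 * CQ wait arguments.
 */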
static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
{
	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
	struct io_uring_mem_region_reg reg;
	struct io_uring_region_desc __user *rd_uptr;
	struct io_uring_region_desc rd;
	int ret;

	if (io_region_is_set(&ctx->param_region))
		return -EBUSY;
	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
		return -EFAULT;
	rd_uptr = u64_to_user_ptr(reg.region_uptr);
	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;
	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
		return -EINVAL;

	/*
	 * This ensures there are no waiters. Waiters are unlocked and it's
	 * hard to synchronise with them, especially if we need to initialise
	 * the region.
	 */
	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
	    !(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EINVAL;

	ret = io_create_region_mmap_safe(ctx, &ctx->param_region, &rd,
					 IORING_MAP_OFF_PARAM_REGION);
	if (ret)
		return ret;
	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
		io_free_region(ctx, &ctx->param_region);
		return -EFAULT;
	}

	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
		ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region);
		ctx->cq_wait_size = rd.size;
	}
	return 0;
}

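/*
 * Opcode dispatcher for io_uring_register(), called with ctx->uring_lock
 * held. Enforces the submitter task check and any registered restrictions
 * before handing off to the individual handlers.
 */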
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	case IORING_REGISTER_ZCRX_IFQ:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_zcrx_ifq(ctx, arg);
		break;
	case IORING_REGISTER_RESIZE_RINGS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_resize_rings(ctx, arg);
		break;
	case IORING_REGISTER_MEM_REGION:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_mem_region(ctx, arg);
		break;
	case IORING_REGISTER_QUERY:
		ret = io_query(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_ZCRX_REFILL:
		ret = io_zcrx_return_bufs(ctx, arg, nr_args);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

/*
 * Given an 'fd' value, return the ctx associated with it. If 'registered' is
 * true, then the registered index is used. Otherwise, the normal fd table.
 * Caller must call fput() on the returned file, unless it's an ERR_PTR.
 */
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
	struct file *file;

	if (registered) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return ERR_PTR(-EINVAL);
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (file)
			get_file(file);
	} else {
		file = fget(fd);
	}

	if (unlikely(!file))
		return ERR_PTR(-EBADF);
	if (io_is_uring_fops(file))
		return file;
	fput(file);
	return ERR_PTR(-EOPNOTSUPP);
}

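/*
 * IORING_REGISTER_SEND_MSG_RING: issue a single IORING_OP_MSG_RING SQE
 * synchronously, without having a ring of our own. The copied SQE must have
 * no flags set and must use the MSG_RING opcode.
 */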
static int io_uring_register_send_msg_ring(void __user *arg, unsigned int nr_args)
{
	struct io_uring_sqe sqe;

	if (!arg || nr_args != 1)
		return -EINVAL;
	if (copy_from_user(&sqe, arg, sizeof(sqe)))
		return -EFAULT;
	/* no flags supported */
	if (sqe.flags)
		return -EINVAL;
	if (sqe.opcode != IORING_OP_MSG_RING)
		return -EINVAL;

	return io_uring_sync_msg_ring(&sqe);
}

/*
 * "blind" registration opcodes are ones where there's no ring given, and
 * hence the source fd must be -1.
 */
static int io_uring_register_blind(unsigned int opcode, void __user *arg,
				   unsigned int nr_args)
{
	switch (opcode) {
	case IORING_REGISTER_SEND_MSG_RING:
		return io_uring_register_send_msg_ring(arg, nr_args);
	case IORING_REGISTER_QUERY:
		return io_query(NULL, arg, nr_args);
	}
	return -EINVAL;
}

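/*
 * Entry point for the io_uring_register() syscall. Opcodes flagged with
 * IORING_REGISTER_USE_REGISTERED_RING resolve 'fd' through the task's
 * registered ring fd table; fd == -1 selects the "blind" opcodes that do
 * not operate on a ring at all. The real work happens in
 * __io_uring_register() under ctx->uring_lock.
 */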
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (fd == -1)
		return io_uring_register_blind(opcode, arg, nr_args);

	file = io_uring_register_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);

	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
				ctx->buf_table.nr, ret);
	mutex_unlock(&ctx->uring_lock);

	fput(file);
	return ret;
}