// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "filetable.h"
#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"
#include "msg_ring.h"
#include "memmap.h"
#include "zcrx.h"
#include "query.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

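/*
 * IORING_REGISTER_PROBE: report which request opcodes this kernel supports.
 * The user-supplied buffer must be zeroed; nr_args is clamped to
 * IORING_OP_LAST, and each probed op gets IO_URING_OP_SUPPORTED set if the
 * opcode is implemented.
 */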
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	size = struct_size(p, ops, nr_args);
	p = memdup_user(arg, size);
	if (IS_ERR(p))
		return PTR_ERR(p);
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (io_uring_op_supported(i))
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

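/*
 * Drop a previously registered personality. The credentials are removed from
 * the xarray and released; -EINVAL is returned if the id was never registered.
 */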
int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}

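/*
 * IORING_REGISTER_PERSONALITY: stash the current task's credentials and hand
 * back an id that SQEs can later reference via sqe->personality.
 */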
static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

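/*
 * Copy an array of struct io_uring_restriction from userspace and translate
 * it into the per-ring bitmaps and sqe_flags masks. Any unknown restriction
 * opcode, or an out-of-range register/sqe op, fails the whole operation.
 */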
static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
					struct io_restriction *restrictions)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = -EINVAL;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST)
				goto err;
			__set_bit(res[i].register_op, restrictions->register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST)
				goto err;
			__set_bit(res[i].sqe_op, restrictions->sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			restrictions->sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			restrictions->sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			goto err;
		}
	}

	ret = 0;

err:
	kfree(res);
	return ret;
}

static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	int ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;
	return ret;
}

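/*
 * IORING_REGISTER_ENABLE_RINGS: bring a ring created with
 * IORING_SETUP_R_DISABLED online. This latches the submitter task for
 * SINGLE_ISSUER rings, activates any registered restrictions, and wakes a
 * waiting SQPOLL thread.
 */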
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if the ring was polled
		 * before submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

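/*
 * Apply an io-wq CPU affinity mask (or clear it if new_mask is NULL), either
 * for the current task's io-wq or, for SQPOLL rings, for the SQPOLL thread.
 * uring_lock is dropped across the SQPOLL call to keep the sqd->lock ->
 * uring_lock ordering.
 */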
static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

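/*
 * IORING_REGISTER_IOWQ_AFF: copy the affinity mask from userspace (handling
 * the compat bitmap layout) and apply it via __io_register_iowq_aff().
 */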
static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

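/*
 * IORING_REGISTER_IOWQ_MAX_WORKERS: new_count[0]/[1] cap the bounded and
 * unbounded io-wq worker pools. A zero entry leaves that limit untouched, and
 * the previous values are copied back to userspace. For SQPOLL rings only the
 * SQPOLL task's io-wq is updated; otherwise the limits are propagated to all
 * tasks registered with the ring.
 */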
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			struct task_struct *tsk;

			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			tsk = sqpoll_task_locked(sqd);
			if (tsk)
				tctx = tsk->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	mutex_lock(&ctx->tctx_lock);
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	mutex_unlock(&ctx->tctx_lock);
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}

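/*
 * IORING_REGISTER_CLOCK: select which clock is used for CQ wait timeouts.
 * Only CLOCK_MONOTONIC and CLOCK_BOOTTIME are accepted.
 */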
static int io_register_clock(struct io_ring_ctx *ctx,
			     struct io_uring_clock_register __user *arg)
{
	struct io_uring_clock_register reg;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;

	switch (reg.clockid) {
	case CLOCK_MONOTONIC:
		ctx->clock_offset = 0;
		break;
	case CLOCK_BOOTTIME:
		ctx->clock_offset = TK_OFFS_BOOT;
		break;
	default:
		return -EINVAL;
	}

	ctx->clockid = reg.clockid;
	return 0;
}

/*
 * State to maintain until we can swap. Both new and old state, used for
 * either mapping or freeing.
 */
struct io_ring_ctx_rings {
	struct io_rings *rings;
	struct io_uring_sqe *sq_sqes;

	struct io_mapped_region sq_region;
	struct io_mapped_region ring_region;
};

static void io_register_free_rings(struct io_ring_ctx *ctx,
				   struct io_ring_ctx_rings *r)
{
	io_free_region(ctx->user, &r->sq_region);
	io_free_region(ctx->user, &r->ring_region);
}

#define swap_old(ctx, o, n, field)		\
	do {					\
		(o).field = (ctx)->field;	\
		(ctx)->field = (n).field;	\
	} while (0)

#define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
#define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \
			 IORING_SETUP_CQE_MIXED | IORING_SETUP_SQE_MIXED)

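/*
 * IORING_REGISTER_RESIZE_RINGS: allocate new SQ/CQ ring regions sized from
 * the passed-in io_uring_params, copy any pending SQEs/CQEs across, and swap
 * the new rings in under ctx->mmap_lock and the completion lock. Only
 * supported for IORING_SETUP_DEFER_TASKRUN rings.
 */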
static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_ctx_config config;
	struct io_uring_region_desc rd;
	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
	unsigned i, tail, old_head;
	struct io_uring_params *p = &config.p;
	struct io_rings_layout *rl = &config.layout;
	int ret;

	memset(&config, 0, sizeof(config));

	/* limited to DEFER_TASKRUN for now */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		return -EINVAL;
	if (copy_from_user(p, arg, sizeof(*p)))
		return -EFAULT;
	if (p->flags & ~RESIZE_FLAGS)
		return -EINVAL;

	/* properties that are always inherited */
	p->flags |= (ctx->flags & COPY_FLAGS);

	ret = io_prepare_config(&config);
	if (unlikely(ret))
		return ret;

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(rl->rings_size);
	if (p->flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p->cq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
	if (ret)
		return ret;

	n.rings = io_region_get_ptr(&n.ring_region);

	/*
	 * At this point n.rings is shared with userspace, just like o.rings
	 * is as well. While we don't expect userspace to modify it while
	 * a resize is in progress, and it's most likely that userspace will
	 * shoot itself in the foot if it does, we can't always assume good
	 * intent... Use read/write once helpers from here on to indicate the
	 * shared nature of it.
	 */
	WRITE_ONCE(n.rings->sq_ring_mask, p->sq_entries - 1);
	WRITE_ONCE(n.rings->cq_ring_mask, p->cq_entries - 1);
	WRITE_ONCE(n.rings->sq_ring_entries, p->sq_entries);
	WRITE_ONCE(n.rings->cq_ring_entries, p->cq_entries);

	if (copy_to_user(arg, p, sizeof(*p))) {
		io_register_free_rings(ctx, &n);
		return -EFAULT;
	}

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(rl->sq_size);
	if (p->flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p->sq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
	if (ret) {
		io_register_free_rings(ctx, &n);
		return ret;
	}
	n.sq_sqes = io_region_get_ptr(&n.sq_region);

	/*
	 * If using SQPOLL, park the thread
	 */
	if (ctx->sq_data) {
		mutex_unlock(&ctx->uring_lock);
		io_sq_thread_park(ctx->sq_data);
		mutex_lock(&ctx->uring_lock);
	}

	/*
	 * We'll do the swap. Grab the ctx->mmap_lock, which will exclude
	 * any new mmap's on the ring fd. Clear out existing mappings to prevent
	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
	 * existing rings beyond this point will fail. Not that it could proceed
	 * at this point anyway, as the io_uring mmap side needs to grab the
	 * ctx->mmap_lock as well. Likewise, hold the completion lock over the
	 * duration of the actual swap.
	 */
	mutex_lock(&ctx->mmap_lock);
	spin_lock(&ctx->completion_lock);
	o.rings = ctx->rings;
	ctx->rings = NULL;
	o.sq_sqes = ctx->sq_sqes;
	ctx->sq_sqes = NULL;

	/*
	 * Now copy SQ and CQ entries, if any. If either of the destination
	 * rings can't hold what is already there, then fail the operation.
	 */
	tail = READ_ONCE(o.rings->sq.tail);
	old_head = READ_ONCE(o.rings->sq.head);
	if (tail - old_head > p->sq_entries)
		goto overflow;
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->sq_entries - 1);
		unsigned dst_head = i & (p->sq_entries - 1);

		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
	}
	WRITE_ONCE(n.rings->sq.head, old_head);
	WRITE_ONCE(n.rings->sq.tail, tail);

	tail = READ_ONCE(o.rings->cq.tail);
	old_head = READ_ONCE(o.rings->cq.head);
	if (tail - old_head > p->cq_entries) {
overflow:
		/* restore old rings, and return -EOVERFLOW via cleanup path */
		ctx->rings = o.rings;
		ctx->sq_sqes = o.sq_sqes;
		to_free = &n;
		ret = -EOVERFLOW;
		goto out;
	}
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->cq_entries - 1);
		unsigned dst_head = i & (p->cq_entries - 1);

		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
	}
	WRITE_ONCE(n.rings->cq.head, old_head);
	WRITE_ONCE(n.rings->cq.tail, tail);
	/* invalidate cached cqe refill */
	ctx->cqe_cached = ctx->cqe_sentinel = NULL;

	WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped));
	atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags));
	WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags));
	WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));

	/* all done, store old pointers and assign new ones */
	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
		ctx->sq_array = (u32 *)((char *)n.rings + rl->sq_array_offset);

	ctx->sq_entries = p->sq_entries;
	ctx->cq_entries = p->cq_entries;

	ctx->rings = n.rings;
	ctx->sq_sqes = n.sq_sqes;
	swap_old(ctx, o, n, ring_region);
	swap_old(ctx, o, n, sq_region);
	to_free = &o;
	ret = 0;
out:
	spin_unlock(&ctx->completion_lock);
	mutex_unlock(&ctx->mmap_lock);
	io_register_free_rings(ctx, to_free);

	if (ctx->sq_data)
		io_sq_thread_unpark(ctx->sq_data);

	return ret;
}

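/*
 * IORING_REGISTER_MEM_REGION: register a caller-described memory region with
 * the ring, optionally to be used as the extended CQ wait argument area
 * (IORING_MEM_REGION_REG_WAIT_ARG).
 */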
static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
{
	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
	struct io_uring_mem_region_reg reg;
	struct io_uring_region_desc __user *rd_uptr;
	struct io_uring_region_desc rd;
	struct io_mapped_region region = {};
	int ret;

	if (io_region_is_set(&ctx->param_region))
		return -EBUSY;
	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
		return -EFAULT;
	rd_uptr = u64_to_user_ptr(reg.region_uptr);
	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;
	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
		return -EINVAL;

	/*
	 * This ensures there are no waiters. Waiters are unlocked and it's
	 * hard to synchronise with them, especially if we need to initialise
	 * the region.
	 */
	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
	    !(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EINVAL;

	ret = io_create_region(ctx, &region, &rd, IORING_MAP_OFF_PARAM_REGION);
	if (ret)
		return ret;
	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
		io_free_region(ctx->user, &region);
		return -EFAULT;
	}

	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
		ctx->cq_wait_arg = io_region_get_ptr(&region);
		ctx->cq_wait_size = rd.size;
	}

	io_region_publish(ctx, &region, &ctx->param_region);
	return 0;
}

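/*
 * Dispatch a single io_uring_register() opcode. Called with ctx->uring_lock
 * held; enforces single-issuer ownership and any registered restrictions
 * before handing off to the per-opcode handler.
 */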
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	case IORING_REGISTER_ZCRX_IFQ:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_zcrx_ifq(ctx, arg);
		break;
	case IORING_REGISTER_RESIZE_RINGS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_resize_rings(ctx, arg);
		break;
	case IORING_REGISTER_MEM_REGION:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_mem_region(ctx, arg);
		break;
	case IORING_REGISTER_QUERY:
		ret = io_query(arg, nr_args);
		break;
	case IORING_REGISTER_ZCRX_CTRL:
		ret = io_zcrx_ctrl(ctx, arg, nr_args);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

/*
 * Given an 'fd' value, return the ctx associated with it. If 'registered' is
 * true, then the registered index is used. Otherwise, the normal fd table.
 * Caller must call fput() on the returned file, unless it's an ERR_PTR.
 */
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
	struct file *file;

	if (registered) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return ERR_PTR(-EINVAL);
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (file)
			get_file(file);
	} else {
		file = fget(fd);
	}

	if (unlikely(!file))
		return ERR_PTR(-EBADF);
	if (io_is_uring_fops(file))
		return file;
	fput(file);
	return ERR_PTR(-EOPNOTSUPP);
}

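/*
 * IORING_REGISTER_SEND_MSG_RING: issue a synchronous MSG_RING without having
 * a ring of our own (fd == -1). The SQE is supplied through 'arg' and must be
 * an IORING_OP_MSG_RING with no flags set.
 */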
static int io_uring_register_send_msg_ring(void __user *arg, unsigned int nr_args)
{
	struct io_uring_sqe sqe;

	if (!arg || nr_args != 1)
		return -EINVAL;
	if (copy_from_user(&sqe, arg, sizeof(sqe)))
		return -EFAULT;
	/* no flags supported */
	if (sqe.flags)
		return -EINVAL;
	if (sqe.opcode != IORING_OP_MSG_RING)
		return -EINVAL;

	return io_uring_sync_msg_ring(&sqe);
}

/*
 * "blind" registration opcodes are ones where there's no ring given, and
 * hence the source fd must be -1.
 */
static int io_uring_register_blind(unsigned int opcode, void __user *arg,
				   unsigned int nr_args)
{
	switch (opcode) {
	case IORING_REGISTER_SEND_MSG_RING:
		return io_uring_register_send_msg_ring(arg, nr_args);
	case IORING_REGISTER_QUERY:
		return io_query(arg, nr_args);
	}
	return -EINVAL;
}

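/*
 * Entry point for io_uring_register(2). IORING_REGISTER_USE_REGISTERED_RING
 * in the opcode selects a ring fd previously registered via
 * IORING_REGISTER_RING_FDS; fd == -1 routes to the "blind" opcodes that need
 * no ring at all.
 *
 * Illustrative userspace sketch (not part of the kernel sources), probing
 * supported opcodes with a raw syscall, where 'ring_fd' is assumed to be an
 * fd returned by io_uring_setup():
 *
 *	struct io_uring_probe *p = calloc(1, sizeof(*p) +
 *				256 * sizeof(struct io_uring_probe_op));
 *	int ret = syscall(__NR_io_uring_register, ring_fd,
 *			  IORING_REGISTER_PROBE, p, 256);
 *	if (!ret && (p->ops[IORING_OP_READV].flags & IO_URING_OP_SUPPORTED))
 *		printf("IORING_OP_READV is supported\n");
 */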
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (fd == -1)
		return io_uring_register_blind(opcode, arg, nr_args);

	file = io_uring_register_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);

	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
				ctx->buf_table.nr, ret);
	mutex_unlock(&ctx->uring_lock);

	fput(file);
	return ret;
}