xref: /linux/io_uring/register.c (revision a4863e002cf0dd6fb2f06796f16d7bc0974e9845)
// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"
#include "msg_ring.h"
#include "memmap.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

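/*
 * Fill in an io_uring_probe structure describing which opcodes this kernel
 * supports, up to IORING_OP_LAST entries. The structure passed in must be
 * zeroed. A minimal userspace sketch (illustrative only, not part of this
 * file), assuming a raw io_uring_register() syscall wrapper:
 *
 *	struct io_uring_probe *p;
 *
 *	p = calloc(1, sizeof(*p) + 256 * sizeof(struct io_uring_probe_op));
 *	io_uring_register(ring_fd, IORING_REGISTER_PROBE, p, 256);
 *	// p->ops[i].flags & IO_URING_OP_SUPPORTED tells if opcode i works
 */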
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	size = struct_size(p, ops, nr_args);
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (io_uring_op_supported(i))
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}

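/*
 * Snapshot the current task's credentials and store them in the personality
 * xarray. The returned id can be placed in sqe->personality so that a request
 * is issued with these saved credentials rather than the submitter's current
 * ones.
 */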
static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

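/*
 * Register restrictions on which register opcodes, SQE opcodes and SQE flags
 * the ring will accept. Only valid while the ring is still disabled
 * (IORING_SETUP_R_DISABLED), and only a single registration is allowed.
 * Illustrative userspace sketch (not part of this file) limiting the ring to
 * READV/WRITEV submissions:
 *
 *	struct io_uring_restriction res[] = {
 *		{ .opcode = IORING_RESTRICTION_SQE_OP, .sqe_op = IORING_OP_READV },
 *		{ .opcode = IORING_RESTRICTION_SQE_OP, .sqe_op = IORING_OP_WRITEV },
 *	};
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_RESTRICTIONS, res, 2);
 */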
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = 0;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].register_op,
				  ctx->restrictions.register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			ret = -EINVAL;
			goto out;
		}
	}

out:
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;

	kfree(res);
	return ret;
}

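/*
 * Enable a ring created with IORING_SETUP_R_DISABLED. Any registered
 * restrictions take effect from this point on, and for
 * IORING_SETUP_SINGLE_ISSUER the task enabling the ring becomes its
 * submitter task.
 */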
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

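/*
 * Apply a CPU affinity mask to the io-wq workers backing this ring, or clear
 * it when new_mask is NULL. For SQPOLL rings this targets the SQPOLL thread's
 * io-wq, and uring_lock is dropped around the call to respect the
 * sqd->lock -> uring_lock ordering.
 */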
static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

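/*
 * Update the limits on the number of bounded and unbounded io-wq workers for
 * this ring. A zero entry in new_count leaves the corresponding limit alone,
 * and the values previously in effect are copied back to userspace. For
 * non-SQPOLL rings the new limits are also propagated to every task attached
 * to the ring.
 */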
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}

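/*
 * Pick the clock source used when waiting for completions with a timeout.
 * Only CLOCK_MONOTONIC and CLOCK_BOOTTIME are accepted.
 */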
static int io_register_clock(struct io_ring_ctx *ctx,
			     struct io_uring_clock_register __user *arg)
{
	struct io_uring_clock_register reg;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;

	switch (reg.clockid) {
	case CLOCK_MONOTONIC:
		ctx->clock_offset = 0;
		break;
	case CLOCK_BOOTTIME:
		ctx->clock_offset = TK_OFFS_BOOT;
		break;
	default:
		return -EINVAL;
	}

	ctx->clockid = reg.clockid;
	return 0;
}

/*
 * State to maintain until we can swap. Both new and old state, used for
 * either mapping or freeing.
 */
struct io_ring_ctx_rings {
	unsigned short n_ring_pages;
	unsigned short n_sqe_pages;
	struct page **ring_pages;
	struct page **sqe_pages;
	struct io_uring_sqe *sq_sqes;
	struct io_rings *rings;
};

static void io_register_free_rings(struct io_uring_params *p,
				   struct io_ring_ctx_rings *r)
{
	if (!(p->flags & IORING_SETUP_NO_MMAP)) {
		io_pages_unmap(r->rings, &r->ring_pages, &r->n_ring_pages,
				true);
		io_pages_unmap(r->sq_sqes, &r->sqe_pages, &r->n_sqe_pages,
				true);
	} else {
		io_pages_free(&r->ring_pages, r->n_ring_pages);
		io_pages_free(&r->sqe_pages, r->n_sqe_pages);
		vunmap(r->rings);
		vunmap(r->sq_sqes);
	}
}

#define swap_old(ctx, o, n, field)		\
	do {					\
		(o).field = (ctx)->field;	\
		(ctx)->field = (n).field;	\
	} while (0)

#define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
#define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP)

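/*
 * Resize the SQ/CQ rings of a live ctx: allocate new rings, copy over any
 * pending SQ and CQ entries, then swap the pointers under ctx->resize_lock
 * and the completion lock, and free the old rings. Limited to
 * IORING_SETUP_DEFER_TASKRUN rings, and for IORING_SETUP_SINGLE_ISSUER only
 * the submitter task may perform the resize.
 */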
static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
	size_t size, sq_array_offset;
	unsigned i, tail, old_head;
	struct io_uring_params p;
	void *ptr;
	int ret;

	/* for single issuer, must be owner resizing */
	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER &&
	    current != ctx->submitter_task)
		return -EEXIST;
	/* limited to DEFER_TASKRUN for now */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		return -EINVAL;
	if (copy_from_user(&p, arg, sizeof(p)))
		return -EFAULT;
	if (p.flags & ~RESIZE_FLAGS)
		return -EINVAL;

	/* properties that are always inherited */
	p.flags |= (ctx->flags & COPY_FLAGS);

	ret = io_uring_fill_params(p.sq_entries, &p);
	if (unlikely(ret))
		return ret;

	/* nothing to do, but copy params back */
	if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) {
		if (copy_to_user(arg, &p, sizeof(p)))
			return -EFAULT;
		return 0;
	}

	size = rings_size(p.flags, p.sq_entries, p.cq_entries,
				&sq_array_offset);
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	if (!(p.flags & IORING_SETUP_NO_MMAP))
		n.rings = io_pages_map(&n.ring_pages, &n.n_ring_pages, size);
	else
		n.rings = __io_uaddr_map(&n.ring_pages, &n.n_ring_pages,
						p.cq_off.user_addr, size);
	if (IS_ERR(n.rings))
		return PTR_ERR(n.rings);

	/*
	 * At this point n.rings is shared with userspace, just like o.rings
	 * is as well. While we don't expect userspace to modify it while
	 * a resize is in progress, and it's most likely that userspace will
	 * shoot itself in the foot if it does, we can't always assume good
	 * intent... Use read/write once helpers from here on to indicate the
	 * shared nature of it.
	 */
	WRITE_ONCE(n.rings->sq_ring_mask, p.sq_entries - 1);
	WRITE_ONCE(n.rings->cq_ring_mask, p.cq_entries - 1);
	WRITE_ONCE(n.rings->sq_ring_entries, p.sq_entries);
	WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries);

	if (copy_to_user(arg, &p, sizeof(p))) {
		io_register_free_rings(&p, &n);
		return -EFAULT;
	}

	if (p.flags & IORING_SETUP_SQE128)
		size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries);
	else
		size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
	if (size == SIZE_MAX) {
		io_register_free_rings(&p, &n);
		return -EOVERFLOW;
	}

	if (!(p.flags & IORING_SETUP_NO_MMAP))
		ptr = io_pages_map(&n.sqe_pages, &n.n_sqe_pages, size);
	else
		ptr = __io_uaddr_map(&n.sqe_pages, &n.n_sqe_pages,
					p.sq_off.user_addr,
					size);
	if (IS_ERR(ptr)) {
		io_register_free_rings(&p, &n);
		return PTR_ERR(ptr);
	}

	/*
	 * If using SQPOLL, park the thread
	 */
	if (ctx->sq_data) {
		mutex_unlock(&ctx->uring_lock);
		io_sq_thread_park(ctx->sq_data);
		mutex_lock(&ctx->uring_lock);
	}

	/*
	 * We'll do the swap. Grab the ctx->resize_lock, which will exclude
	 * any new mmap's on the ring fd. Clear out existing mappings to prevent
	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
	 * existing rings beyond this point will fail. Not that it could proceed
	 * at this point anyway, as the io_uring mmap side needs to grab the
	 * ctx->resize_lock as well. Likewise, hold the completion lock over the
	 * duration of the actual swap.
	 */
	mutex_lock(&ctx->resize_lock);
	spin_lock(&ctx->completion_lock);
	o.rings = ctx->rings;
	ctx->rings = NULL;
	o.sq_sqes = ctx->sq_sqes;
	ctx->sq_sqes = NULL;

	/*
	 * Now copy SQ and CQ entries, if any. If either of the destination
	 * rings can't hold what is already there, then fail the operation.
	 */
	n.sq_sqes = ptr;
	tail = READ_ONCE(o.rings->sq.tail);
	old_head = READ_ONCE(o.rings->sq.head);
	if (tail - old_head > p.sq_entries)
		goto overflow;
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->sq_entries - 1);
		unsigned dst_head = i & (p.sq_entries - 1);

		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
	}
	WRITE_ONCE(n.rings->sq.head, READ_ONCE(o.rings->sq.head));
	WRITE_ONCE(n.rings->sq.tail, READ_ONCE(o.rings->sq.tail));

	tail = READ_ONCE(o.rings->cq.tail);
	old_head = READ_ONCE(o.rings->cq.head);
	if (tail - old_head > p.cq_entries) {
overflow:
		/* restore old rings, and return -EOVERFLOW via cleanup path */
		ctx->rings = o.rings;
		ctx->sq_sqes = o.sq_sqes;
		to_free = &n;
		ret = -EOVERFLOW;
		goto out;
	}
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->cq_entries - 1);
		unsigned dst_head = i & (p.cq_entries - 1);

		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
	}
	WRITE_ONCE(n.rings->cq.head, READ_ONCE(o.rings->cq.head));
	WRITE_ONCE(n.rings->cq.tail, READ_ONCE(o.rings->cq.tail));
	/* invalidate cached cqe refill */
	ctx->cqe_cached = ctx->cqe_sentinel = NULL;

	WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped));
	WRITE_ONCE(n.rings->sq_flags, READ_ONCE(o.rings->sq_flags));
	WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags));
	WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));

	/* all done, store old pointers and assign new ones */
	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
		ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset);

	ctx->sq_entries = p.sq_entries;
	ctx->cq_entries = p.cq_entries;

	ctx->rings = n.rings;
	ctx->sq_sqes = n.sq_sqes;
	swap_old(ctx, o, n, n_ring_pages);
	swap_old(ctx, o, n, n_sqe_pages);
	swap_old(ctx, o, n, ring_pages);
	swap_old(ctx, o, n, sqe_pages);
	to_free = &o;
	ret = 0;
out:
	spin_unlock(&ctx->completion_lock);
	mutex_unlock(&ctx->resize_lock);
	io_register_free_rings(&p, to_free);

	if (ctx->sq_data)
		io_sq_thread_unpark(ctx->sq_data);

	return ret;
}

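/*
 * Register a user memory region as the ring's parameter region. Only one such
 * region may be registered, and when it is flagged as a wait-argument region
 * (IORING_MEM_REGION_REG_WAIT_ARG) the ring must still be disabled, so that
 * no waiters can race with the region being set up.
 */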
static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
{
	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
	struct io_uring_mem_region_reg reg;
	struct io_uring_region_desc __user *rd_uptr;
	struct io_uring_region_desc rd;
	int ret;

	if (io_region_is_set(&ctx->param_region))
		return -EBUSY;
	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
		return -EFAULT;
	rd_uptr = u64_to_user_ptr(reg.region_uptr);
	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
		return -EFAULT;

	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;
	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
		return -EINVAL;

	/*
	 * This ensures there are no waiters. Waiters are unlocked and it's
	 * hard to synchronise with them, especially if we need to initialise
	 * the region.
	 */
	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
	    !(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EINVAL;

	ret = io_create_region(ctx, &ctx->param_region, &rd);
	if (ret)
		return ret;
	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
		io_free_region(ctx, &ctx->param_region);
		return -EFAULT;
	}

	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
		ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region);
		ctx->cq_wait_size = rd.size;
	}
	return 0;
}

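/*
 * Opcode dispatcher for io_uring_register(), called with ctx->uring_lock
 * held. Rejects callers other than the submitter task once one is set, and
 * enforces any registered restrictions before handing off to the per-opcode
 * helpers.
 */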
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	case IORING_REGISTER_RESIZE_RINGS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_resize_rings(ctx, arg);
		break;
	case IORING_REGISTER_MEM_REGION:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_mem_region(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

/*
 * Given an 'fd' value, return the ctx associated with it. If 'registered' is
 * true, then the registered index is used. Otherwise, the normal fd table.
 * Caller must call fput() on the returned file, unless it's an ERR_PTR.
 */
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
	struct file *file;

	if (registered) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return ERR_PTR(-EINVAL);
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
	} else {
		file = fget(fd);
	}

	if (unlikely(!file))
		return ERR_PTR(-EBADF);
	if (io_is_uring_fops(file))
		return file;
	fput(file);
	return ERR_PTR(-EOPNOTSUPP);
}

/*
 * "blind" registration opcodes are ones where there's no ring given, and
 * hence the source fd must be -1.
 */
static int io_uring_register_blind(unsigned int opcode, void __user *arg,
				   unsigned int nr_args)
{
	switch (opcode) {
	case IORING_REGISTER_SEND_MSG_RING: {
		struct io_uring_sqe sqe;

		if (!arg || nr_args != 1)
			return -EINVAL;
		if (copy_from_user(&sqe, arg, sizeof(sqe)))
			return -EFAULT;
		/* no flags supported */
		if (sqe.flags)
			return -EINVAL;
		if (sqe.opcode == IORING_OP_MSG_RING)
			return io_uring_sync_msg_ring(&sqe);
		}
	}

	return -EINVAL;
}

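/*
 * io_uring_register() syscall entry. If IORING_REGISTER_USE_REGISTERED_RING
 * is or'ed into the opcode, 'fd' is an index into the task's registered ring
 * array rather than a regular file descriptor. An fd of -1 selects the
 * "blind" opcodes that don't operate on a ring at all.
 */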
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (fd == -1)
		return io_uring_register_blind(opcode, arg, nr_args);

	file = io_uring_register_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);

	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
				ctx->buf_table.nr, ret);
	mutex_unlock(&ctx->uring_lock);
	if (!use_registered_ring)
		fput(file);
	return ret;
}
929