xref: /linux/io_uring/register.c (revision 2c1ed907520c50326b8f604907a8478b27881a2e)
// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"
#include "msg_ring.h"
#include "memmap.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

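/*
 * IORING_REGISTER_PROBE: report which opcodes this kernel supports. The
 * caller passes a zeroed struct io_uring_probe with room for nr_args ops;
 * each entry gets IO_URING_OP_SUPPORTED set if the opcode is implemented.
 */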
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	size = struct_size(p, ops, nr_args);
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (io_uring_op_supported(i))
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}

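/*
 * IORING_REGISTER_PERSONALITY: remember the current task's credentials under
 * a new id in ctx->personalities. Requests can later run with those creds by
 * setting the id in sqe->personality.
 */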
static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

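/*
 * Parse an array of struct io_uring_restriction entries from userspace into
 * @restrictions. Any unknown restriction opcode, or an out-of-range register
 * or sqe opcode, fails the whole operation with -EINVAL.
 */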
static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
					struct io_restriction *restrictions)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = -EINVAL;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST)
				goto err;
			__set_bit(res[i].register_op, restrictions->register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST)
				goto err;
			__set_bit(res[i].sqe_op, restrictions->sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			restrictions->sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			restrictions->sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			goto err;
		}
	}

	ret = 0;

err:
	kfree(res);
	return ret;
}

static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	int ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;
	return ret;
}

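/*
 * IORING_REGISTER_ENABLE_RINGS: take a ring created with
 * IORING_SETUP_R_DISABLED out of its disabled state, arming any registered
 * restrictions and waking the SQPOLL thread if one is waiting.
 */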
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if the ring was polled
		 * before submitter_task was set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
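		/*
		 * Changing the SQPOLL thread's affinity parks that thread and
		 * takes sqd->lock, so drop uring_lock across the call to keep
		 * the sqd->lock -> ctx->uring_lock ordering documented in
		 * io_register_iowq_max_workers().
		 */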
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

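/*
 * IORING_REGISTER_IOWQ_AFF: copy a CPU mask from userspace (compat aware) and
 * apply it to the io-wq of the issuing task, or to the SQPOLL thread's io-wq
 * when the ring was set up with IORING_SETUP_SQPOLL.
 */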
static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

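/*
 * IORING_REGISTER_IOWQ_MAX_WORKERS: update the maximum number of bounded and
 * unbounded io-wq workers. The previous limits are copied back to userspace,
 * so passing two zeroes simply queries the current values.
 */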
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}

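/*
 * IORING_REGISTER_CLOCK: select the clock that CQ wait timeouts are measured
 * against. Only CLOCK_MONOTONIC and CLOCK_BOOTTIME are accepted.
 */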
static int io_register_clock(struct io_ring_ctx *ctx,
			     struct io_uring_clock_register __user *arg)
{
	struct io_uring_clock_register reg;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;

	switch (reg.clockid) {
	case CLOCK_MONOTONIC:
		ctx->clock_offset = 0;
		break;
	case CLOCK_BOOTTIME:
		ctx->clock_offset = TK_OFFS_BOOT;
		break;
	default:
		return -EINVAL;
	}

	ctx->clockid = reg.clockid;
	return 0;
}

/*
 * State to maintain until we can swap. Both new and old state, used for
 * either mapping or freeing.
 */
struct io_ring_ctx_rings {
	struct io_rings *rings;
	struct io_uring_sqe *sq_sqes;

	struct io_mapped_region sq_region;
	struct io_mapped_region ring_region;
};

static void io_register_free_rings(struct io_ring_ctx *ctx,
				   struct io_uring_params *p,
				   struct io_ring_ctx_rings *r)
{
	io_free_region(ctx, &r->sq_region);
	io_free_region(ctx, &r->ring_region);
}

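/* move ctx's current @field into 'o' and install the new value from 'n' */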
#define swap_old(ctx, o, n, field)		\
	do {					\
		(o).field = (ctx)->field;	\
		(ctx)->field = (n).field;	\
	} while (0)

#define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
#define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP)

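/*
 * IORING_REGISTER_RESIZE_RINGS: allocate new SQ/CQ rings of the requested
 * size, copy over any pending entries and swap the new rings in under
 * ctx->mmap_lock and ->completion_lock. Only DEFER_TASKRUN rings are
 * supported, and with SINGLE_ISSUER only the submitter task may resize.
 */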
static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_region_desc rd;
	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
	size_t size, sq_array_offset;
	unsigned i, tail, old_head;
	struct io_uring_params p;
	int ret;

	/* for single issuer, must be owner resizing */
	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER &&
	    current != ctx->submitter_task)
		return -EEXIST;
	/* limited to DEFER_TASKRUN for now */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		return -EINVAL;
	if (copy_from_user(&p, arg, sizeof(p)))
		return -EFAULT;
	if (p.flags & ~RESIZE_FLAGS)
		return -EINVAL;

	/* properties that are always inherited */
	p.flags |= (ctx->flags & COPY_FLAGS);

	ret = io_uring_fill_params(p.sq_entries, &p);
	if (unlikely(ret))
		return ret;

	/* nothing to do, but copy params back */
	if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) {
		if (copy_to_user(arg, &p, sizeof(p)))
			return -EFAULT;
		return 0;
	}

	size = rings_size(p.flags, p.sq_entries, p.cq_entries,
				&sq_array_offset);
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(size);
	if (p.flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p.cq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
	if (ret) {
		io_register_free_rings(ctx, &p, &n);
		return ret;
	}
	n.rings = io_region_get_ptr(&n.ring_region);

	/*
	 * At this point n.rings is shared with userspace, just like o.rings
	 * is as well. While we don't expect userspace to modify it while
	 * a resize is in progress, and it's most likely that userspace will
	 * shoot itself in the foot if it does, we can't always assume good
	 * intent... Use read/write once helpers from here on to indicate the
	 * shared nature of it.
	 */
	WRITE_ONCE(n.rings->sq_ring_mask, p.sq_entries - 1);
	WRITE_ONCE(n.rings->cq_ring_mask, p.cq_entries - 1);
	WRITE_ONCE(n.rings->sq_ring_entries, p.sq_entries);
	WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries);

	if (copy_to_user(arg, &p, sizeof(p))) {
		io_register_free_rings(ctx, &p, &n);
		return -EFAULT;
	}

	if (p.flags & IORING_SETUP_SQE128)
		size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries);
	else
		size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
	if (size == SIZE_MAX) {
		io_register_free_rings(ctx, &p, &n);
		return -EOVERFLOW;
	}

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(size);
	if (p.flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p.sq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region_mmap_safe(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
	if (ret) {
		io_register_free_rings(ctx, &p, &n);
		return ret;
	}
	n.sq_sqes = io_region_get_ptr(&n.sq_region);

	/*
	 * If using SQPOLL, park the thread
	 */
	if (ctx->sq_data) {
		mutex_unlock(&ctx->uring_lock);
		io_sq_thread_park(ctx->sq_data);
		mutex_lock(&ctx->uring_lock);
	}

	/*
	 * We'll do the swap. Grab the ctx->mmap_lock, which will exclude
	 * any new mmap's on the ring fd. Clear out existing mappings to prevent
	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
	 * existing rings beyond this point will fail. Not that it could proceed
	 * at this point anyway, as the io_uring mmap side needs to grab the
	 * ctx->mmap_lock as well. Likewise, hold the completion lock over the
	 * duration of the actual swap.
	 */
	mutex_lock(&ctx->mmap_lock);
	spin_lock(&ctx->completion_lock);
	o.rings = ctx->rings;
	ctx->rings = NULL;
	o.sq_sqes = ctx->sq_sqes;
	ctx->sq_sqes = NULL;

	/*
	 * Now copy SQ and CQ entries, if any. If either of the destination
	 * rings can't hold what is already there, then fail the operation.
	 */
	tail = READ_ONCE(o.rings->sq.tail);
	old_head = READ_ONCE(o.rings->sq.head);
	if (tail - old_head > p.sq_entries)
		goto overflow;
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->sq_entries - 1);
		unsigned dst_head = i & (p.sq_entries - 1);

		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
	}
	WRITE_ONCE(n.rings->sq.head, old_head);
	WRITE_ONCE(n.rings->sq.tail, tail);

	tail = READ_ONCE(o.rings->cq.tail);
	old_head = READ_ONCE(o.rings->cq.head);
	if (tail - old_head > p.cq_entries) {
overflow:
		/* restore old rings, and return -EOVERFLOW via cleanup path */
		ctx->rings = o.rings;
		ctx->sq_sqes = o.sq_sqes;
		to_free = &n;
		ret = -EOVERFLOW;
		goto out;
	}
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->cq_entries - 1);
		unsigned dst_head = i & (p.cq_entries - 1);

		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
	}
	WRITE_ONCE(n.rings->cq.head, old_head);
	WRITE_ONCE(n.rings->cq.tail, tail);
	/* invalidate cached cqe refill */
	ctx->cqe_cached = ctx->cqe_sentinel = NULL;

	WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped));
	atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags));
	WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags));
	WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));

	/* all done, store old pointers and assign new ones */
	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
		ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset);

	ctx->sq_entries = p.sq_entries;
	ctx->cq_entries = p.cq_entries;

	ctx->rings = n.rings;
	ctx->sq_sqes = n.sq_sqes;
	swap_old(ctx, o, n, ring_region);
	swap_old(ctx, o, n, sq_region);
	to_free = &o;
	ret = 0;
out:
	spin_unlock(&ctx->completion_lock);
	mutex_unlock(&ctx->mmap_lock);
	io_register_free_rings(ctx, &p, to_free);

	if (ctx->sq_data)
		io_sq_thread_unpark(ctx->sq_data);

	return ret;
}

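/*
 * IORING_REGISTER_MEM_REGION: register a caller-described memory region with
 * the ring, optionally used for the extended CQ wait argument
 * (IORING_MEM_REGION_REG_WAIT_ARG). Only a single parameter region can be
 * registered per ring.
 */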
static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
{
	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
	struct io_uring_mem_region_reg reg;
	struct io_uring_region_desc __user *rd_uptr;
	struct io_uring_region_desc rd;
	int ret;

	if (io_region_is_set(&ctx->param_region))
		return -EBUSY;
	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
		return -EFAULT;
	rd_uptr = u64_to_user_ptr(reg.region_uptr);
	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;
	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
		return -EINVAL;

	/*
	 * This ensures there are no waiters. Waiters are unlocked and it's
	 * hard to synchronise with them, especially if we need to initialise
	 * the region.
	 */
	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
	    !(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EINVAL;

	ret = io_create_region_mmap_safe(ctx, &ctx->param_region, &rd,
					 IORING_MAP_OFF_PARAM_REGION);
	if (ret)
		return ret;
	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
		io_free_region(ctx, &ctx->param_region);
		return -EFAULT;
	}

	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
		ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region);
		ctx->cq_wait_size = rd.size;
	}
	return 0;
}

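/*
 * Dispatch a single io_uring_register() opcode. Called with ctx->uring_lock
 * held; the opcode is checked against any registered restrictions first.
 */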
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	case IORING_REGISTER_RESIZE_RINGS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_resize_rings(ctx, arg);
		break;
	case IORING_REGISTER_MEM_REGION:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_mem_region(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

/*
 * Given an 'fd' value, return the ctx associated with it. If 'registered' is
 * true, then the registered index is used. Otherwise, the normal fd table.
 * Caller must call fput() on the returned file, unless it's an ERR_PTR.
 */
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
	struct file *file;

	if (registered) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return ERR_PTR(-EINVAL);
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (file)
			get_file(file);
	} else {
		file = fget(fd);
	}

	if (unlikely(!file))
		return ERR_PTR(-EBADF);
	if (io_is_uring_fops(file))
		return file;
	fput(file);
	return ERR_PTR(-EOPNOTSUPP);
}

/*
 * "blind" registration opcodes are ones where there's no ring given, and
 * hence the source fd must be -1.
 */
static int io_uring_register_blind(unsigned int opcode, void __user *arg,
				   unsigned int nr_args)
{
	switch (opcode) {
	case IORING_REGISTER_SEND_MSG_RING: {
		struct io_uring_sqe sqe;

		if (!arg || nr_args != 1)
			return -EINVAL;
		if (copy_from_user(&sqe, arg, sizeof(sqe)))
			return -EFAULT;
		/* no flags supported */
		if (sqe.flags)
			return -EINVAL;
		if (sqe.opcode == IORING_OP_MSG_RING)
			return io_uring_sync_msg_ring(&sqe);
		}
	}

	return -EINVAL;
}

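/*
 * io_uring_register(2) syscall entry point. From userspace this is usually
 * reached via liburing; a raw probe call looks roughly like the following
 * (illustrative sketch only, 'probe' being a zeroed buffer sized for 256
 * struct io_uring_probe_op entries):
 *
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_PROBE, probe, 256);
 *
 * If IORING_REGISTER_USE_REGISTERED_RING is set in the opcode, 'fd' is an
 * index into the task's registered ring fds rather than a normal file
 * descriptor, and fd == -1 selects the "blind" opcodes handled above.
 */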
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (fd == -1)
		return io_uring_register_blind(opcode, arg, nr_args);

	file = io_uring_register_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);

	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
				ctx->buf_table.nr, ret);
	mutex_unlock(&ctx->uring_lock);

	fput(file);
	return ret;
}