xref: /linux/io_uring/register.c (revision 3f1c07fc21c68bd3bd2df9d2c9441f6485e934d9)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Code related to the io_uring_register() syscall
4  *
5  * Copyright (C) 2023 Jens Axboe
6  */
7 #include <linux/kernel.h>
8 #include <linux/errno.h>
9 #include <linux/syscalls.h>
10 #include <linux/refcount.h>
11 #include <linux/bits.h>
12 #include <linux/fs.h>
13 #include <linux/file.h>
14 #include <linux/slab.h>
15 #include <linux/uaccess.h>
16 #include <linux/nospec.h>
17 #include <linux/compat.h>
18 #include <linux/io_uring.h>
19 #include <linux/io_uring_types.h>
20 
21 #include "filetable.h"
22 #include "io_uring.h"
23 #include "opdef.h"
24 #include "tctx.h"
25 #include "rsrc.h"
26 #include "sqpoll.h"
27 #include "register.h"
28 #include "cancel.h"
29 #include "kbuf.h"
30 #include "napi.h"
31 #include "eventfd.h"
32 #include "msg_ring.h"
33 #include "memmap.h"
34 #include "zcrx.h"
35 #include "query.h"
36 
37 #define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
38 				 IORING_REGISTER_LAST + IORING_OP_LAST)
39 
40 static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
41 			   unsigned nr_args)
42 {
43 	struct io_uring_probe *p;
44 	size_t size;
45 	int i, ret;
46 
47 	if (nr_args > IORING_OP_LAST)
48 		nr_args = IORING_OP_LAST;
49 
50 	size = struct_size(p, ops, nr_args);
51 	p = memdup_user(arg, size);
52 	if (IS_ERR(p))
53 		return PTR_ERR(p);
54 	ret = -EINVAL;
55 	if (memchr_inv(p, 0, size))
56 		goto out;
57 
58 	p->last_op = IORING_OP_LAST - 1;
59 
60 	for (i = 0; i < nr_args; i++) {
61 		p->ops[i].op = i;
62 		if (io_uring_op_supported(i))
63 			p->ops[i].flags = IO_URING_OP_SUPPORTED;
64 	}
65 	p->ops_len = i;
66 
67 	ret = 0;
68 	if (copy_to_user(arg, p, size))
69 		ret = -EFAULT;
70 out:
71 	kfree(p);
72 	return ret;
73 }
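
/*
 * Illustrative userspace sketch, not part of this file: probing which
 * opcodes the running kernel supports. Assumes the uapi definitions from
 * <linux/io_uring.h>, the usual libc headers and a raw syscall(2) wrapper;
 * 'ring_fd' is an assumed io_uring instance fd.
 *
 *	struct io_uring_probe *p;
 *	size_t len = sizeof(*p) + 256 * sizeof(p->ops[0]);
 *
 *	p = calloc(1, len);	// buffer must be zeroed, see memchr_inv() above
 *	if (!syscall(__NR_io_uring_register, ring_fd,
 *		     IORING_REGISTER_PROBE, p, 256)) {
 *		for (int i = 0; i < p->ops_len; i++)
 *			if (p->ops[i].flags & IO_URING_OP_SUPPORTED)
 *				printf("opcode %d supported\n", p->ops[i].op);
 *	}
 *	free(p);
 */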
74 
75 int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
76 {
77 	const struct cred *creds;
78 
79 	creds = xa_erase(&ctx->personalities, id);
80 	if (creds) {
81 		put_cred(creds);
82 		return 0;
83 	}
84 
85 	return -EINVAL;
86 }
87 
88 
89 static int io_register_personality(struct io_ring_ctx *ctx)
90 {
91 	const struct cred *creds;
92 	u32 id;
93 	int ret;
94 
95 	creds = get_current_cred();
96 
97 	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
98 			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
99 	if (ret < 0) {
100 		put_cred(creds);
101 		return ret;
102 	}
103 	return id;
104 }
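
/*
 * Illustrative userspace sketch, not part of this file: registering the
 * calling task's credentials and applying them to a later request.
 * Assumes a raw syscall(2) wrapper; 'ring_fd' and 'sqe' are assumed to
 * exist in the caller.
 *
 *	int id = syscall(__NR_io_uring_register, ring_fd,
 *			 IORING_REGISTER_PERSONALITY, NULL, 0);
 *	if (id >= 0)
 *		sqe->personality = id;	// issue this SQE with the saved creds
 *	...
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_UNREGISTER_PERSONALITY, NULL, id);
 */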
105 
106 static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
107 					struct io_restriction *restrictions)
108 {
109 	struct io_uring_restriction *res;
110 	size_t size;
111 	int i, ret;
112 
113 	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
114 		return -EINVAL;
115 
116 	size = array_size(nr_args, sizeof(*res));
117 	if (size == SIZE_MAX)
118 		return -EOVERFLOW;
119 
120 	res = memdup_user(arg, size);
121 	if (IS_ERR(res))
122 		return PTR_ERR(res);
123 
124 	ret = -EINVAL;
125 
126 	for (i = 0; i < nr_args; i++) {
127 		switch (res[i].opcode) {
128 		case IORING_RESTRICTION_REGISTER_OP:
129 			if (res[i].register_op >= IORING_REGISTER_LAST)
130 				goto err;
131 			__set_bit(res[i].register_op, restrictions->register_op);
132 			break;
133 		case IORING_RESTRICTION_SQE_OP:
134 			if (res[i].sqe_op >= IORING_OP_LAST)
135 				goto err;
136 			__set_bit(res[i].sqe_op, restrictions->sqe_op);
137 			break;
138 		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
139 			restrictions->sqe_flags_allowed = res[i].sqe_flags;
140 			break;
141 		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
142 			restrictions->sqe_flags_required = res[i].sqe_flags;
143 			break;
144 		default:
145 			goto err;
146 		}
147 	}
148 
149 	ret = 0;
150 
151 err:
152 	kfree(res);
153 	return ret;
154 }
155 
156 static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
157 					   void __user *arg, unsigned int nr_args)
158 {
159 	int ret;
160 
161 	/* Restrictions allowed only if rings started disabled */
162 	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
163 		return -EBADFD;
164 
165 	/* We allow only a single restrictions registration */
166 	if (ctx->restrictions.registered)
167 		return -EBUSY;
168 
169 	ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
170 	/* Reset all restrictions if an error happened */
171 	if (ret != 0)
172 		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
173 	else
174 		ctx->restrictions.registered = true;
175 	return ret;
176 }
177 
178 static int io_register_enable_rings(struct io_ring_ctx *ctx)
179 {
180 	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
181 		return -EBADFD;
182 
183 	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
184 		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
185 		/*
186 		 * Lazy activation attempts would fail if the ring was polled before
187 		 * submitter_task is set.
188 		 */
189 		if (wq_has_sleeper(&ctx->poll_wq))
190 			io_activate_pollwq(ctx);
191 	}
192 
193 	if (ctx->restrictions.registered)
194 		ctx->restricted = 1;
195 
196 	ctx->flags &= ~IORING_SETUP_R_DISABLED;
197 	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
198 		wake_up(&ctx->sq_data->wait);
199 	return 0;
200 }
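
/*
 * Illustrative userspace sketch, not part of this file: a ring created
 * with IORING_SETUP_R_DISABLED can have restrictions registered exactly
 * once, and is then enabled. Assumes the uapi definitions and a raw
 * syscall(2) wrapper; 'ring_fd' is an assumed io_uring fd.
 *
 *	struct io_uring_restriction res[2] = {};
 *
 *	res[0].opcode = IORING_RESTRICTION_SQE_OP;
 *	res[0].sqe_op = IORING_OP_READV;
 *	res[1].opcode = IORING_RESTRICTION_REGISTER_OP;
 *	res[1].register_op = IORING_REGISTER_BUFFERS;
 *
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_RESTRICTIONS, res, 2);
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_ENABLE_RINGS, NULL, 0);
 */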
201 
202 static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
203 					 cpumask_var_t new_mask)
204 {
205 	int ret;
206 
207 	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
208 		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
209 	} else {
210 		mutex_unlock(&ctx->uring_lock);
211 		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
212 		mutex_lock(&ctx->uring_lock);
213 	}
214 
215 	return ret;
216 }
217 
218 static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
219 				       void __user *arg, unsigned len)
220 {
221 	cpumask_var_t new_mask;
222 	int ret;
223 
224 	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
225 		return -ENOMEM;
226 
227 	cpumask_clear(new_mask);
228 	if (len > cpumask_size())
229 		len = cpumask_size();
230 
231 #ifdef CONFIG_COMPAT
232 	if (in_compat_syscall())
233 		ret = compat_get_bitmap(cpumask_bits(new_mask),
234 					(const compat_ulong_t __user *)arg,
235 					len * 8 /* CHAR_BIT */);
236 	else
237 #endif
238 		ret = copy_from_user(new_mask, arg, len);
239 
240 	if (ret) {
241 		free_cpumask_var(new_mask);
242 		return -EFAULT;
243 	}
244 
245 	ret = __io_register_iowq_aff(ctx, new_mask);
246 	free_cpumask_var(new_mask);
247 	return ret;
248 }
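
/*
 * Illustrative userspace sketch, not part of this file: pinning this
 * ring's io-wq (or SQPOLL) workers to CPUs 0-1. The argument is a cpumask
 * and nr_args is its size in bytes; IORING_UNREGISTER_IOWQ_AFF resets it.
 * Assumes <sched.h> and a raw syscall(2) wrapper.
 *
 *	cpu_set_t mask;
 *
 *	CPU_ZERO(&mask);
 *	CPU_SET(0, &mask);
 *	CPU_SET(1, &mask);
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_IOWQ_AFF,
 *		&mask, sizeof(mask));
 */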
249 
250 static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
251 {
252 	return __io_register_iowq_aff(ctx, NULL);
253 }
254 
255 static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
256 					       void __user *arg)
257 	__must_hold(&ctx->uring_lock)
258 {
259 	struct io_tctx_node *node;
260 	struct io_uring_task *tctx = NULL;
261 	struct io_sq_data *sqd = NULL;
262 	__u32 new_count[2];
263 	int i, ret;
264 
265 	if (copy_from_user(new_count, arg, sizeof(new_count)))
266 		return -EFAULT;
267 	for (i = 0; i < ARRAY_SIZE(new_count); i++)
268 		if (new_count[i] > INT_MAX)
269 			return -EINVAL;
270 
271 	if (ctx->flags & IORING_SETUP_SQPOLL) {
272 		sqd = ctx->sq_data;
273 		if (sqd) {
274 			struct task_struct *tsk;
275 
276 			/*
277 			 * Observe the correct sqd->lock -> ctx->uring_lock
278 			 * ordering. Fine to drop uring_lock here, we hold
279 			 * a ref to the ctx.
280 			 */
281 			refcount_inc(&sqd->refs);
282 			mutex_unlock(&ctx->uring_lock);
283 			mutex_lock(&sqd->lock);
284 			mutex_lock(&ctx->uring_lock);
285 			tsk = sqpoll_task_locked(sqd);
286 			if (tsk)
287 				tctx = tsk->io_uring;
288 		}
289 	} else {
290 		tctx = current->io_uring;
291 	}
292 
293 	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
294 
295 	for (i = 0; i < ARRAY_SIZE(new_count); i++)
296 		if (new_count[i])
297 			ctx->iowq_limits[i] = new_count[i];
298 	ctx->iowq_limits_set = true;
299 
300 	if (tctx && tctx->io_wq) {
301 		ret = io_wq_max_workers(tctx->io_wq, new_count);
302 		if (ret)
303 			goto err;
304 	} else {
305 		memset(new_count, 0, sizeof(new_count));
306 	}
307 
308 	if (sqd) {
309 		mutex_unlock(&ctx->uring_lock);
310 		mutex_unlock(&sqd->lock);
311 		io_put_sq_data(sqd);
312 		mutex_lock(&ctx->uring_lock);
313 	}
314 
315 	if (copy_to_user(arg, new_count, sizeof(new_count)))
316 		return -EFAULT;
317 
318 	/* that's it for SQPOLL, only the SQPOLL task creates requests */
319 	if (sqd)
320 		return 0;
321 
322 	/* now propagate the restriction to all registered users */
323 	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
324 		tctx = node->task->io_uring;
325 		if (WARN_ON_ONCE(!tctx->io_wq))
326 			continue;
327 
328 		for (i = 0; i < ARRAY_SIZE(new_count); i++)
329 			new_count[i] = ctx->iowq_limits[i];
330 		/* ignore errors, it always returns zero anyway */
331 		(void)io_wq_max_workers(tctx->io_wq, new_count);
332 	}
333 	return 0;
334 err:
335 	if (sqd) {
336 		mutex_unlock(&ctx->uring_lock);
337 		mutex_unlock(&sqd->lock);
338 		io_put_sq_data(sqd);
339 		mutex_lock(&ctx->uring_lock);
340 	}
341 	return ret;
342 }
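
/*
 * Illustrative userspace sketch, not part of this file: capping the
 * bounded/unbounded io-wq worker counts. A zero entry leaves that limit
 * untouched, and the previous limits are copied back on return. Assumes a
 * raw syscall(2) wrapper.
 *
 *	__u32 counts[2] = { 8, 0 };	// [0] = bounded, [1] = unbounded
 *
 *	if (!syscall(__NR_io_uring_register, ring_fd,
 *		     IORING_REGISTER_IOWQ_MAX_WORKERS, counts, 2))
 *		printf("old limits: %u bounded, %u unbounded\n",
 *		       counts[0], counts[1]);
 */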
343 
344 static int io_register_clock(struct io_ring_ctx *ctx,
345 			     struct io_uring_clock_register __user *arg)
346 {
347 	struct io_uring_clock_register reg;
348 
349 	if (copy_from_user(&reg, arg, sizeof(reg)))
350 		return -EFAULT;
351 	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
352 		return -EINVAL;
353 
354 	switch (reg.clockid) {
355 	case CLOCK_MONOTONIC:
356 		ctx->clock_offset = 0;
357 		break;
358 	case CLOCK_BOOTTIME:
359 		ctx->clock_offset = TK_OFFS_BOOT;
360 		break;
361 	default:
362 		return -EINVAL;
363 	}
364 
365 	ctx->clockid = reg.clockid;
366 	return 0;
367 }
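
/*
 * Illustrative userspace sketch, not part of this file: switching the
 * clock used for CQ wait timeouts to CLOCK_BOOTTIME. The reserved fields
 * must be zero and nr_args must be 0. Assumes a raw syscall(2) wrapper.
 *
 *	struct io_uring_clock_register reg = {
 *		.clockid = CLOCK_BOOTTIME,
 *	};
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_CLOCK,
 *		&reg, 0);
 */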
368 
369 /*
370  * State to maintain until we can swap. Both new and old state, used for
371  * either mapping or freeing.
372  */
373 struct io_ring_ctx_rings {
374 	struct io_rings *rings;
375 	struct io_uring_sqe *sq_sqes;
376 
377 	struct io_mapped_region sq_region;
378 	struct io_mapped_region ring_region;
379 };
380 
381 static void io_register_free_rings(struct io_ring_ctx *ctx,
382 				   struct io_ring_ctx_rings *r)
383 {
384 	io_free_region(ctx->user, &r->sq_region);
385 	io_free_region(ctx->user, &r->ring_region);
386 }
387 
388 #define swap_old(ctx, o, n, field)		\
389 	do {					\
390 		(o).field = (ctx)->field;	\
391 		(ctx)->field = (n).field;	\
392 	} while (0)
393 
394 #define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
395 #define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
396 			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \
397 			 IORING_SETUP_CQE_MIXED | IORING_SETUP_SQE_MIXED)
398 
399 static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
400 {
401 	struct io_ctx_config config;
402 	struct io_uring_region_desc rd;
403 	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
404 	unsigned i, tail, old_head;
405 	struct io_uring_params *p = &config.p;
406 	struct io_rings_layout *rl = &config.layout;
407 	int ret;
408 
409 	memset(&config, 0, sizeof(config));
410 
411 	/* limited to DEFER_TASKRUN for now */
412 	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
413 		return -EINVAL;
414 	if (copy_from_user(p, arg, sizeof(*p)))
415 		return -EFAULT;
416 	if (p->flags & ~RESIZE_FLAGS)
417 		return -EINVAL;
418 
419 	/* properties that are always inherited */
420 	p->flags |= (ctx->flags & COPY_FLAGS);
421 
422 	ret = io_prepare_config(&config);
423 	if (unlikely(ret))
424 		return ret;
425 
426 	memset(&rd, 0, sizeof(rd));
427 	rd.size = PAGE_ALIGN(rl->rings_size);
428 	if (p->flags & IORING_SETUP_NO_MMAP) {
429 		rd.user_addr = p->cq_off.user_addr;
430 		rd.flags |= IORING_MEM_REGION_TYPE_USER;
431 	}
432 	ret = io_create_region(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
433 	if (ret)
434 		return ret;
435 
436 	n.rings = io_region_get_ptr(&n.ring_region);
437 
438 	/*
439 	 * At this point n.rings is shared with userspace, just like o.rings
440 	 * is as well. While we don't expect userspace to modify it while
441 	 * a resize is in progress, and it's most likely that userspace will
442 	 * shoot itself in the foot if it does, we can't always assume good
443 	 * intent... Use read/write once helpers from here on to indicate the
444 	 * shared nature of it.
445 	 */
446 	WRITE_ONCE(n.rings->sq_ring_mask, p->sq_entries - 1);
447 	WRITE_ONCE(n.rings->cq_ring_mask, p->cq_entries - 1);
448 	WRITE_ONCE(n.rings->sq_ring_entries, p->sq_entries);
449 	WRITE_ONCE(n.rings->cq_ring_entries, p->cq_entries);
450 
451 	if (copy_to_user(arg, p, sizeof(*p))) {
452 		io_register_free_rings(ctx, &n);
453 		return -EFAULT;
454 	}
455 
456 	memset(&rd, 0, sizeof(rd));
457 	rd.size = PAGE_ALIGN(rl->sq_size);
458 	if (p->flags & IORING_SETUP_NO_MMAP) {
459 		rd.user_addr = p->sq_off.user_addr;
460 		rd.flags |= IORING_MEM_REGION_TYPE_USER;
461 	}
462 	ret = io_create_region(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
463 	if (ret) {
464 		io_register_free_rings(ctx, &n);
465 		return ret;
466 	}
467 	n.sq_sqes = io_region_get_ptr(&n.sq_region);
468 
469 	/*
470 	 * If using SQPOLL, park the thread
471 	 */
472 	if (ctx->sq_data) {
473 		mutex_unlock(&ctx->uring_lock);
474 		io_sq_thread_park(ctx->sq_data);
475 		mutex_lock(&ctx->uring_lock);
476 	}
477 
478 	/*
479 	 * We'll do the swap. Grab the ctx->mmap_lock, which will exclude
480 	 * any new mmaps on the ring fd. Clear out existing mappings to prevent
481 	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
482 	 * existing rings beyond this point will fail. Not that it could proceed
483 	 * at this point anyway, as the io_uring mmap side needs to grab the
484 	 * ctx->mmap_lock as well. Likewise, hold the completion lock over the
485 	 * duration of the actual swap.
486 	 */
487 	mutex_lock(&ctx->mmap_lock);
488 	spin_lock(&ctx->completion_lock);
489 	o.rings = ctx->rings;
490 	ctx->rings = NULL;
491 	o.sq_sqes = ctx->sq_sqes;
492 	ctx->sq_sqes = NULL;
493 
494 	/*
495 	 * Now copy SQ and CQ entries, if any. If either of the destination
496 	 * rings can't hold what is already there, then fail the operation.
497 	 */
498 	tail = READ_ONCE(o.rings->sq.tail);
499 	old_head = READ_ONCE(o.rings->sq.head);
500 	if (tail - old_head > p->sq_entries)
501 		goto overflow;
502 	for (i = old_head; i < tail; i++) {
503 		unsigned src_head = i & (ctx->sq_entries - 1);
504 		unsigned dst_head = i & (p->sq_entries - 1);
505 
506 		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
507 	}
508 	WRITE_ONCE(n.rings->sq.head, old_head);
509 	WRITE_ONCE(n.rings->sq.tail, tail);
510 
511 	tail = READ_ONCE(o.rings->cq.tail);
512 	old_head = READ_ONCE(o.rings->cq.head);
513 	if (tail - old_head > p->cq_entries) {
514 overflow:
515 		/* restore old rings, and return -EOVERFLOW via cleanup path */
516 		ctx->rings = o.rings;
517 		ctx->sq_sqes = o.sq_sqes;
518 		to_free = &n;
519 		ret = -EOVERFLOW;
520 		goto out;
521 	}
522 	for (i = old_head; i < tail; i++) {
523 		unsigned src_head = i & (ctx->cq_entries - 1);
524 		unsigned dst_head = i & (p->cq_entries - 1);
525 
526 		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
527 	}
528 	WRITE_ONCE(n.rings->cq.head, old_head);
529 	WRITE_ONCE(n.rings->cq.tail, tail);
530 	/* invalidate cached cqe refill */
531 	ctx->cqe_cached = ctx->cqe_sentinel = NULL;
532 
533 	WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped));
534 	atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags));
535 	WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags));
536 	WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));
537 
538 	/* all done, store old pointers and assign new ones */
539 	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
540 		ctx->sq_array = (u32 *)((char *)n.rings + rl->sq_array_offset);
541 
542 	ctx->sq_entries = p->sq_entries;
543 	ctx->cq_entries = p->cq_entries;
544 
545 	ctx->rings = n.rings;
546 	ctx->sq_sqes = n.sq_sqes;
547 	swap_old(ctx, o, n, ring_region);
548 	swap_old(ctx, o, n, sq_region);
549 	to_free = &o;
550 	ret = 0;
551 out:
552 	spin_unlock(&ctx->completion_lock);
553 	mutex_unlock(&ctx->mmap_lock);
554 	io_register_free_rings(ctx, to_free);
555 
556 	if (ctx->sq_data)
557 		io_sq_thread_unpark(ctx->sq_data);
558 
559 	return ret;
560 }
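
/*
 * Illustrative userspace sketch, not part of this file: resizing a
 * DEFER_TASKRUN ring in place. Only RESIZE_FLAGS may be set by the caller;
 * everything in COPY_FLAGS is inherited from the existing ring. On success
 * the returned offsets describe the new rings, which the application has
 * to re-mmap() (unless IORING_SETUP_NO_MMAP is in use). Assumes a raw
 * syscall(2) wrapper.
 *
 *	struct io_uring_params p = {};
 *
 *	p.sq_entries = 256;
 *	p.cq_entries = 1024;
 *	p.flags = IORING_SETUP_CQSIZE;
 *
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_RESIZE_RINGS, &p, 1);
 */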
561 
562 static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
563 {
564 	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
565 	struct io_uring_mem_region_reg reg;
566 	struct io_uring_region_desc __user *rd_uptr;
567 	struct io_uring_region_desc rd;
568 	struct io_mapped_region region = {};
569 	int ret;
570 
571 	if (io_region_is_set(&ctx->param_region))
572 		return -EBUSY;
573 	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
574 		return -EFAULT;
575 	rd_uptr = u64_to_user_ptr(reg.region_uptr);
576 	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
577 		return -EFAULT;
578 	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
579 		return -EINVAL;
580 	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
581 		return -EINVAL;
582 
583 	/*
584 	 * This ensures there are no waiters. Waiters are unlocked and it's
585 	 * hard to synchronise with them, especially if we need to initialise
586 	 * the region.
587 	 */
588 	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
589 	    !(ctx->flags & IORING_SETUP_R_DISABLED))
590 		return -EINVAL;
591 
592 	ret = io_create_region(ctx, &region, &rd, IORING_MAP_OFF_PARAM_REGION);
593 	if (ret)
594 		return ret;
595 	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
596 		io_free_region(ctx->user, &region);
597 		return -EFAULT;
598 	}
599 
600 	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
601 		ctx->cq_wait_arg = io_region_get_ptr(&region);
602 		ctx->cq_wait_size = rd.size;
603 	}
604 
605 	io_region_publish(ctx, &region, &ctx->param_region);
606 	return 0;
607 }
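
/*
 * Illustrative userspace sketch, not part of this file: registering a
 * user-provided, page-aligned region to hold the extended CQ wait
 * argument. With IORING_MEM_REGION_REG_WAIT_ARG the ring must still be
 * R_DISABLED, per the check above. Assumes a raw syscall(2) wrapper and
 * a 4 KiB page size.
 *
 *	void *mem = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *			 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
 *	struct io_uring_region_desc rd = {
 *		.user_addr = (__u64)(uintptr_t)mem,
 *		.size = 4096,
 *		.flags = IORING_MEM_REGION_TYPE_USER,
 *	};
 *	struct io_uring_mem_region_reg reg = {
 *		.region_uptr = (__u64)(uintptr_t)&rd,
 *		.flags = IORING_MEM_REGION_REG_WAIT_ARG,
 *	};
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_MEM_REGION,
 *		&reg, 1);
 */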
608 
609 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
610 			       void __user *arg, unsigned nr_args)
611 	__releases(ctx->uring_lock)
612 	__acquires(ctx->uring_lock)
613 {
614 	int ret;
615 
616 	/*
617 	 * We don't quiesce the refs for register anymore and so it can't be
618 	 * dying as we're holding a file ref here.
619 	 */
620 	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
621 		return -ENXIO;
622 
623 	if (ctx->submitter_task && ctx->submitter_task != current)
624 		return -EEXIST;
625 
626 	if (ctx->restricted) {
627 		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
628 		if (!test_bit(opcode, ctx->restrictions.register_op))
629 			return -EACCES;
630 	}
631 
632 	switch (opcode) {
633 	case IORING_REGISTER_BUFFERS:
634 		ret = -EFAULT;
635 		if (!arg)
636 			break;
637 		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
638 		break;
639 	case IORING_UNREGISTER_BUFFERS:
640 		ret = -EINVAL;
641 		if (arg || nr_args)
642 			break;
643 		ret = io_sqe_buffers_unregister(ctx);
644 		break;
645 	case IORING_REGISTER_FILES:
646 		ret = -EFAULT;
647 		if (!arg)
648 			break;
649 		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
650 		break;
651 	case IORING_UNREGISTER_FILES:
652 		ret = -EINVAL;
653 		if (arg || nr_args)
654 			break;
655 		ret = io_sqe_files_unregister(ctx);
656 		break;
657 	case IORING_REGISTER_FILES_UPDATE:
658 		ret = io_register_files_update(ctx, arg, nr_args);
659 		break;
660 	case IORING_REGISTER_EVENTFD:
661 		ret = -EINVAL;
662 		if (nr_args != 1)
663 			break;
664 		ret = io_eventfd_register(ctx, arg, 0);
665 		break;
666 	case IORING_REGISTER_EVENTFD_ASYNC:
667 		ret = -EINVAL;
668 		if (nr_args != 1)
669 			break;
670 		ret = io_eventfd_register(ctx, arg, 1);
671 		break;
672 	case IORING_UNREGISTER_EVENTFD:
673 		ret = -EINVAL;
674 		if (arg || nr_args)
675 			break;
676 		ret = io_eventfd_unregister(ctx);
677 		break;
678 	case IORING_REGISTER_PROBE:
679 		ret = -EINVAL;
680 		if (!arg || nr_args > 256)
681 			break;
682 		ret = io_probe(ctx, arg, nr_args);
683 		break;
684 	case IORING_REGISTER_PERSONALITY:
685 		ret = -EINVAL;
686 		if (arg || nr_args)
687 			break;
688 		ret = io_register_personality(ctx);
689 		break;
690 	case IORING_UNREGISTER_PERSONALITY:
691 		ret = -EINVAL;
692 		if (arg)
693 			break;
694 		ret = io_unregister_personality(ctx, nr_args);
695 		break;
696 	case IORING_REGISTER_ENABLE_RINGS:
697 		ret = -EINVAL;
698 		if (arg || nr_args)
699 			break;
700 		ret = io_register_enable_rings(ctx);
701 		break;
702 	case IORING_REGISTER_RESTRICTIONS:
703 		ret = io_register_restrictions(ctx, arg, nr_args);
704 		break;
705 	case IORING_REGISTER_FILES2:
706 		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
707 		break;
708 	case IORING_REGISTER_FILES_UPDATE2:
709 		ret = io_register_rsrc_update(ctx, arg, nr_args,
710 					      IORING_RSRC_FILE);
711 		break;
712 	case IORING_REGISTER_BUFFERS2:
713 		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
714 		break;
715 	case IORING_REGISTER_BUFFERS_UPDATE:
716 		ret = io_register_rsrc_update(ctx, arg, nr_args,
717 					      IORING_RSRC_BUFFER);
718 		break;
719 	case IORING_REGISTER_IOWQ_AFF:
720 		ret = -EINVAL;
721 		if (!arg || !nr_args)
722 			break;
723 		ret = io_register_iowq_aff(ctx, arg, nr_args);
724 		break;
725 	case IORING_UNREGISTER_IOWQ_AFF:
726 		ret = -EINVAL;
727 		if (arg || nr_args)
728 			break;
729 		ret = io_unregister_iowq_aff(ctx);
730 		break;
731 	case IORING_REGISTER_IOWQ_MAX_WORKERS:
732 		ret = -EINVAL;
733 		if (!arg || nr_args != 2)
734 			break;
735 		ret = io_register_iowq_max_workers(ctx, arg);
736 		break;
737 	case IORING_REGISTER_RING_FDS:
738 		ret = io_ringfd_register(ctx, arg, nr_args);
739 		break;
740 	case IORING_UNREGISTER_RING_FDS:
741 		ret = io_ringfd_unregister(ctx, arg, nr_args);
742 		break;
743 	case IORING_REGISTER_PBUF_RING:
744 		ret = -EINVAL;
745 		if (!arg || nr_args != 1)
746 			break;
747 		ret = io_register_pbuf_ring(ctx, arg);
748 		break;
749 	case IORING_UNREGISTER_PBUF_RING:
750 		ret = -EINVAL;
751 		if (!arg || nr_args != 1)
752 			break;
753 		ret = io_unregister_pbuf_ring(ctx, arg);
754 		break;
755 	case IORING_REGISTER_SYNC_CANCEL:
756 		ret = -EINVAL;
757 		if (!arg || nr_args != 1)
758 			break;
759 		ret = io_sync_cancel(ctx, arg);
760 		break;
761 	case IORING_REGISTER_FILE_ALLOC_RANGE:
762 		ret = -EINVAL;
763 		if (!arg || nr_args)
764 			break;
765 		ret = io_register_file_alloc_range(ctx, arg);
766 		break;
767 	case IORING_REGISTER_PBUF_STATUS:
768 		ret = -EINVAL;
769 		if (!arg || nr_args != 1)
770 			break;
771 		ret = io_register_pbuf_status(ctx, arg);
772 		break;
773 	case IORING_REGISTER_NAPI:
774 		ret = -EINVAL;
775 		if (!arg || nr_args != 1)
776 			break;
777 		ret = io_register_napi(ctx, arg);
778 		break;
779 	case IORING_UNREGISTER_NAPI:
780 		ret = -EINVAL;
781 		if (nr_args != 1)
782 			break;
783 		ret = io_unregister_napi(ctx, arg);
784 		break;
785 	case IORING_REGISTER_CLOCK:
786 		ret = -EINVAL;
787 		if (!arg || nr_args)
788 			break;
789 		ret = io_register_clock(ctx, arg);
790 		break;
791 	case IORING_REGISTER_CLONE_BUFFERS:
792 		ret = -EINVAL;
793 		if (!arg || nr_args != 1)
794 			break;
795 		ret = io_register_clone_buffers(ctx, arg);
796 		break;
797 	case IORING_REGISTER_ZCRX_IFQ:
798 		ret = -EINVAL;
799 		if (!arg || nr_args != 1)
800 			break;
801 		ret = io_register_zcrx_ifq(ctx, arg);
802 		break;
803 	case IORING_REGISTER_RESIZE_RINGS:
804 		ret = -EINVAL;
805 		if (!arg || nr_args != 1)
806 			break;
807 		ret = io_register_resize_rings(ctx, arg);
808 		break;
809 	case IORING_REGISTER_MEM_REGION:
810 		ret = -EINVAL;
811 		if (!arg || nr_args != 1)
812 			break;
813 		ret = io_register_mem_region(ctx, arg);
814 		break;
815 	case IORING_REGISTER_QUERY:
816 		ret = io_query(arg, nr_args);
817 		break;
818 	case IORING_REGISTER_ZCRX_CTRL:
819 		ret = io_zcrx_ctrl(ctx, arg, nr_args);
820 		break;
821 	default:
822 		ret = -EINVAL;
823 		break;
824 	}
825 
826 	return ret;
827 }
828 
829 /*
830  * Given an 'fd' value, return the file associated with it. If 'registered' is
831  * true, then the registered index is used. Otherwise, the normal fd table.
832  * Caller must call fput() on the returned file, unless it's an ERR_PTR.
833  */
834 struct file *io_uring_register_get_file(unsigned int fd, bool registered)
835 {
836 	struct file *file;
837 
838 	if (registered) {
839 		/*
840 		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, so we
841 		 * need only dereference our task-private array to find it.
842 		 */
843 		struct io_uring_task *tctx = current->io_uring;
844 
845 		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
846 			return ERR_PTR(-EINVAL);
847 		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
848 		file = tctx->registered_rings[fd];
849 		if (file)
850 			get_file(file);
851 	} else {
852 		file = fget(fd);
853 	}
854 
855 	if (unlikely(!file))
856 		return ERR_PTR(-EBADF);
857 	if (io_is_uring_fops(file))
858 		return file;
859 	fput(file);
860 	return ERR_PTR(-EOPNOTSUPP);
861 }
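
/*
 * Illustrative userspace sketch, not part of this file: registering the
 * ring fd itself, then issuing further register calls through the
 * registered index by OR'ing IORING_REGISTER_USE_REGISTERED_RING into the
 * opcode. Assumes a raw syscall(2) wrapper.
 *
 *	struct io_uring_rsrc_update up = {
 *		.offset = -1U,		// let the kernel pick a slot
 *		.data = ring_fd,
 *	};
 *	__u32 counts[2] = { 0, 0 };	// zeroes: just query current limits
 *
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_RING_FDS, &up, 1);
 *	// up.offset now holds the registered index, usable in place of the fd
 *	syscall(__NR_io_uring_register, up.offset,
 *		IORING_REGISTER_IOWQ_MAX_WORKERS | IORING_REGISTER_USE_REGISTERED_RING,
 *		counts, 2);
 */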
862 
863 static int io_uring_register_send_msg_ring(void __user *arg, unsigned int nr_args)
864 {
865 	struct io_uring_sqe sqe;
866 
867 	if (!arg || nr_args != 1)
868 		return -EINVAL;
869 	if (copy_from_user(&sqe, arg, sizeof(sqe)))
870 		return -EFAULT;
871 	/* no flags supported */
872 	if (sqe.flags)
873 		return -EINVAL;
874 	if (sqe.opcode != IORING_OP_MSG_RING)
875 		return -EINVAL;
876 
877 	return io_uring_sync_msg_ring(&sqe);
878 }
879 
880 /*
881  * "blind" registration opcodes are ones where there's no ring given, and
882  * hence the source fd must be -1.
883  */
884 static int io_uring_register_blind(unsigned int opcode, void __user *arg,
885 				   unsigned int nr_args)
886 {
887 	switch (opcode) {
888 	case IORING_REGISTER_SEND_MSG_RING:
889 		return io_uring_register_send_msg_ring(arg, nr_args);
890 	case IORING_REGISTER_QUERY:
891 		return io_query(arg, nr_args);
892 	}
893 	return -EINVAL;
894 }
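
/*
 * Illustrative userspace sketch, not part of this file: a "blind" register
 * call passes fd == -1. Here a synchronous MSG_RING is sent to another
 * ring; per my reading of the MSG_RING data path, sqe.len becomes the
 * target CQE's res and sqe.off its user_data (treat that mapping as an
 * assumption). 'target_ring_fd' is an assumed io_uring fd.
 *
 *	struct io_uring_sqe sqe = {};
 *
 *	sqe.opcode = IORING_OP_MSG_RING;
 *	sqe.fd = target_ring_fd;
 *	sqe.len = 0;
 *	sqe.off = 0x1234;
 *
 *	syscall(__NR_io_uring_register, -1,
 *		IORING_REGISTER_SEND_MSG_RING, &sqe, 1);
 */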
895 
896 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
897 		void __user *, arg, unsigned int, nr_args)
898 {
899 	struct io_ring_ctx *ctx;
900 	long ret = -EBADF;
901 	struct file *file;
902 	bool use_registered_ring;
903 
904 	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
905 	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;
906 
907 	if (opcode >= IORING_REGISTER_LAST)
908 		return -EINVAL;
909 
910 	if (fd == -1)
911 		return io_uring_register_blind(opcode, arg, nr_args);
912 
913 	file = io_uring_register_get_file(fd, use_registered_ring);
914 	if (IS_ERR(file))
915 		return PTR_ERR(file);
916 	ctx = file->private_data;
917 
918 	mutex_lock(&ctx->uring_lock);
919 	ret = __io_uring_register(ctx, opcode, arg, nr_args);
920 
921 	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
922 				ctx->buf_table.nr, ret);
923 	mutex_unlock(&ctx->uring_lock);
924 
925 	fput(file);
926 	return ret;
927 }
928