// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "filetable.h"
#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"
#include "msg_ring.h"
#include "memmap.h"
#include "zcrx.h"
#include "query.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	size = struct_size(p, ops, nr_args);
	p = memdup_user(arg, size);
	if (IS_ERR(p))
		return PTR_ERR(p);
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (io_uring_op_supported(i))
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

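/*
 * Userspace view of io_probe() above: a minimal sketch (not kernel code,
 * raw syscall, error handling trimmed, 'ring_fd' names an already set up
 * io_uring fd) that asks which opcodes the running kernel supports. It
 * relies only on the UAPI definitions in <linux/io_uring.h>; the probe
 * buffer must be zeroed or the kernel returns -EINVAL, as enforced by the
 * memchr_inv() check above.
 *
 *	size_t len = sizeof(struct io_uring_probe) +
 *		     IORING_OP_LAST * sizeof(struct io_uring_probe_op);
 *	struct io_uring_probe *p = calloc(1, len);
 *	int i;
 *
 *	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PROBE,
 *		    p, IORING_OP_LAST) == 0) {
 *		for (i = 0; i < p->ops_len; i++) {
 *			if (p->ops[i].flags & IO_URING_OP_SUPPORTED)
 *				;	// opcode p->ops[i].op is usable
 *		}
 *	}
 *	free(p);
 */
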
int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}

static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

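/*
 * Illustrative use of the personality registration above (sketch only,
 * raw syscall on an existing ring fd). The returned id can be stored in
 * sqe->personality so that an individual request is issued with the
 * credentials captured at registration time rather than those of the
 * submitting task.
 *
 *	int id = syscall(__NR_io_uring_register, ring_fd,
 *			 IORING_REGISTER_PERSONALITY, NULL, 0);
 *
 *	sqe->personality = id;	// per-request credential override
 *	...
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_UNREGISTER_PERSONALITY, NULL, id);
 */
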
static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
					struct io_restriction *restrictions)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = -EINVAL;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST)
				goto err;
			__set_bit(res[i].register_op, restrictions->register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST)
				goto err;
			__set_bit(res[i].sqe_op, restrictions->sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			restrictions->sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			restrictions->sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			goto err;
		}
	}

	ret = 0;

err:
	kfree(res);
	return ret;
}

static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	int ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;
	return ret;
}

static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if the ring was polled
		 * before submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

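/*
 * The two registration paths above are only usable on a ring created with
 * IORING_SETUP_R_DISABLED. A hedged userspace sketch of the intended flow:
 * restrict the ring while it is still disabled, then enable it. Note that
 * the restriction table only starts to apply once the ring is enabled, so
 * the IORING_REGISTER_ENABLE_RINGS call itself does not need to be in the
 * allowed set.
 *
 *	struct io_uring_restriction res[2] = {};
 *
 *	res[0].opcode = IORING_RESTRICTION_REGISTER_OP;
 *	res[0].register_op = IORING_REGISTER_BUFFERS;
 *	res[1].opcode = IORING_RESTRICTION_SQE_OP;
 *	res[1].sqe_op = IORING_OP_READV;
 *
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_RESTRICTIONS, res, 2);
 *	// the ring can now be handed to a less trusted component
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_ENABLE_RINGS, NULL, 0);
 */
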
static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

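/*
 * Userspace sketch for the affinity helpers above. The argument is a CPU
 * bitmask; passing a cpu_set_t and its size (as liburing does) is the
 * common choice. Unregistering resets the io-wq workers to the default
 * affinity.
 *
 *	cpu_set_t mask;
 *
 *	CPU_ZERO(&mask);
 *	CPU_SET(2, &mask);
 *	CPU_SET(3, &mask);
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_IOWQ_AFF,
 *		&mask, sizeof(mask));
 *	...
 *	syscall(__NR_io_uring_register, ring_fd, IORING_UNREGISTER_IOWQ_AFF,
 *		NULL, 0);
 */
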
static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			struct task_struct *tsk;

			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			tsk = sqpoll_task_locked(sqd);
			if (tsk)
				tctx = tsk->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}

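/*
 * Userspace sketch for the max-workers interface above: new_count[0] caps
 * bounded workers, new_count[1] caps unbounded workers, and a zero entry
 * leaves that limit untouched. Per the copy-back above, the array is
 * rewritten with the limits previously in effect (or zeroes if no io-wq
 * exists yet), so passing { 0, 0 } simply queries the current settings.
 *
 *	unsigned int counts[2] = { 8, 64 };	// bounded, unbounded
 *
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_IOWQ_MAX_WORKERS, counts, 2);
 *	// counts[] now holds the limits that applied before this call
 */
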
static int io_register_clock(struct io_ring_ctx *ctx,
			     struct io_uring_clock_register __user *arg)
{
	struct io_uring_clock_register reg;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;

	switch (reg.clockid) {
	case CLOCK_MONOTONIC:
		ctx->clock_offset = 0;
		break;
	case CLOCK_BOOTTIME:
		ctx->clock_offset = TK_OFFS_BOOT;
		break;
	default:
		return -EINVAL;
	}

	ctx->clockid = reg.clockid;
	return 0;
}

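/*
 * Sketch of selecting the wait clock handled above (raw syscall on an
 * existing ring fd). Only CLOCK_MONOTONIC and CLOCK_BOOTTIME are accepted,
 * and the reserved fields must stay zero.
 *
 *	struct io_uring_clock_register reg = {
 *		.clockid = CLOCK_BOOTTIME,
 *	};
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_CLOCK,
 *		&reg, 0);
 */
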
/*
 * State to maintain until we can swap. Both new and old state, used for
 * either mapping or freeing.
 */
struct io_ring_ctx_rings {
	struct io_rings *rings;
	struct io_uring_sqe *sq_sqes;

	struct io_mapped_region sq_region;
	struct io_mapped_region ring_region;
};

static void io_register_free_rings(struct io_ring_ctx *ctx,
				   struct io_uring_params *p,
				   struct io_ring_ctx_rings *r)
{
	io_free_region(ctx, &r->sq_region);
	io_free_region(ctx, &r->ring_region);
}

#define swap_old(ctx, o, n, field)		\
	do {					\
		(o).field = (ctx)->field;	\
		(ctx)->field = (n).field;	\
	} while (0)

#define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
#define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \
			 IORING_SETUP_CQE_MIXED)

static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_region_desc rd;
	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
	size_t size, sq_array_offset;
	unsigned i, tail, old_head;
	struct io_uring_params p;
	int ret;

	/* limited to DEFER_TASKRUN for now */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		return -EINVAL;
	if (copy_from_user(&p, arg, sizeof(p)))
		return -EFAULT;
	if (p.flags & ~RESIZE_FLAGS)
		return -EINVAL;

	/* properties that are always inherited */
	p.flags |= (ctx->flags & COPY_FLAGS);

	ret = io_uring_fill_params(p.sq_entries, &p);
	if (unlikely(ret))
		return ret;

	size = rings_size(p.flags, p.sq_entries, p.cq_entries,
				&sq_array_offset);
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(size);
	if (p.flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p.cq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
	if (ret) {
		io_register_free_rings(ctx, &p, &n);
		return ret;
	}
	n.rings = io_region_get_ptr(&n.ring_region);

	/*
	 * At this point n.rings is shared with userspace, just like o.rings
	 * is as well. While we don't expect userspace to modify it while
	 * a resize is in progress, and it's most likely that userspace will
	 * shoot itself in the foot if it does, we can't always assume good
	 * intent... Use read/write once helpers from here on to indicate the
	 * shared nature of it.
	 */
	WRITE_ONCE(n.rings->sq_ring_mask, p.sq_entries - 1);
	WRITE_ONCE(n.rings->cq_ring_mask, p.cq_entries - 1);
	WRITE_ONCE(n.rings->sq_ring_entries, p.sq_entries);
	WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries);

	if (copy_to_user(arg, &p, sizeof(p))) {
		io_register_free_rings(ctx, &p, &n);
		return -EFAULT;
	}

	if (p.flags & IORING_SETUP_SQE128)
		size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries);
	else
		size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
	if (size == SIZE_MAX) {
		io_register_free_rings(ctx, &p, &n);
		return -EOVERFLOW;
	}

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(size);
	if (p.flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p.sq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region_mmap_safe(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
	if (ret) {
		io_register_free_rings(ctx, &p, &n);
		return ret;
	}
	n.sq_sqes = io_region_get_ptr(&n.sq_region);

	/*
	 * If using SQPOLL, park the thread
	 */
	if (ctx->sq_data) {
		mutex_unlock(&ctx->uring_lock);
		io_sq_thread_park(ctx->sq_data);
		mutex_lock(&ctx->uring_lock);
	}

	/*
	 * We'll do the swap. Grab the ctx->mmap_lock, which will exclude
	 * any new mmaps on the ring fd. Clear out existing mappings to prevent
	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
	 * existing rings beyond this point will fail. Not that it could proceed
	 * at this point anyway, as the io_uring mmap side needs to grab the
	 * ctx->mmap_lock as well. Likewise, hold the completion lock over the
	 * duration of the actual swap.
	 */
	mutex_lock(&ctx->mmap_lock);
	spin_lock(&ctx->completion_lock);
	o.rings = ctx->rings;
	ctx->rings = NULL;
	o.sq_sqes = ctx->sq_sqes;
	ctx->sq_sqes = NULL;

	/*
	 * Now copy SQ and CQ entries, if any. If either of the destination
	 * rings can't hold what is already there, then fail the operation.
	 */
	tail = READ_ONCE(o.rings->sq.tail);
	old_head = READ_ONCE(o.rings->sq.head);
	if (tail - old_head > p.sq_entries)
		goto overflow;
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->sq_entries - 1);
		unsigned dst_head = i & (p.sq_entries - 1);

		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
	}
	WRITE_ONCE(n.rings->sq.head, old_head);
	WRITE_ONCE(n.rings->sq.tail, tail);

	tail = READ_ONCE(o.rings->cq.tail);
	old_head = READ_ONCE(o.rings->cq.head);
	if (tail - old_head > p.cq_entries) {
overflow:
		/* restore old rings, and return -EOVERFLOW via cleanup path */
		ctx->rings = o.rings;
		ctx->sq_sqes = o.sq_sqes;
		to_free = &n;
		ret = -EOVERFLOW;
		goto out;
	}
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->cq_entries - 1);
		unsigned dst_head = i & (p.cq_entries - 1);

		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
	}
	WRITE_ONCE(n.rings->cq.head, old_head);
	WRITE_ONCE(n.rings->cq.tail, tail);
	/* invalidate cached cqe refill */
	ctx->cqe_cached = ctx->cqe_sentinel = NULL;

	WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped));
	atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags));
	WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags));
	WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));

	/* all done, store old pointers and assign new ones */
	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
		ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset);

	ctx->sq_entries = p.sq_entries;
	ctx->cq_entries = p.cq_entries;

	ctx->rings = n.rings;
	ctx->sq_sqes = n.sq_sqes;
	swap_old(ctx, o, n, ring_region);
	swap_old(ctx, o, n, sq_region);
	to_free = &o;
	ret = 0;
out:
	spin_unlock(&ctx->completion_lock);
	mutex_unlock(&ctx->mmap_lock);
	io_register_free_rings(ctx, &p, to_free);

	if (ctx->sq_data)
		io_sq_thread_unpark(ctx->sq_data);

	return ret;
}

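/*
 * Userspace sketch of a ring resize using the helper above. The argument
 * is a struct io_uring_params of which only the entry counts and
 * RESIZE_FLAGS are honoured; the inherited setup flags and the ring
 * offsets are filled in by the kernel and copied back. The ring must have
 * been created with IORING_SETUP_DEFER_TASKRUN.
 *
 *	struct io_uring_params p = {
 *		.sq_entries = 256,
 *		.cq_entries = 1024,
 *		.flags = IORING_SETUP_CQSIZE,
 *	};
 *
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_RESIZE_RINGS, &p, 1);
 *	// on success, remap the SQ/CQ regions using the offsets returned in p
 */
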
static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
{
	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
	struct io_uring_mem_region_reg reg;
	struct io_uring_region_desc __user *rd_uptr;
	struct io_uring_region_desc rd;
	int ret;

	if (io_region_is_set(&ctx->param_region))
		return -EBUSY;
	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
		return -EFAULT;
	rd_uptr = u64_to_user_ptr(reg.region_uptr);
	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;
	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
		return -EINVAL;

	/*
	 * This ensures there are no waiters. Waiters are unlocked and it's
	 * hard to synchronise with them, especially if we need to initialise
	 * the region.
	 */
	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
	    !(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EINVAL;

	ret = io_create_region_mmap_safe(ctx, &ctx->param_region, &rd,
					 IORING_MAP_OFF_PARAM_REGION);
	if (ret)
		return ret;
	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
		guard(mutex)(&ctx->mmap_lock);
		io_free_region(ctx, &ctx->param_region);
		return -EFAULT;
	}

	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
		ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region);
		ctx->cq_wait_size = rd.size;
	}
	return 0;
}

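/*
 * Sketch of registering a parameter region via the helper above, here
 * flagged as the CQ wait-argument area. Assumptions: user-provided memory
 * (IORING_MEM_REGION_TYPE_USER) that is page sized and page aligned, and a
 * ring still in IORING_SETUP_R_DISABLED state, as required for the
 * wait-arg flavour by the check above.
 *
 *	void *mem = aligned_alloc(4096, 4096);
 *	struct io_uring_region_desc rd = {
 *		.user_addr = (__u64)(uintptr_t)mem,
 *		.size = 4096,
 *		.flags = IORING_MEM_REGION_TYPE_USER,
 *	};
 *	struct io_uring_mem_region_reg reg = {
 *		.region_uptr = (__u64)(uintptr_t)&rd,
 *		.flags = IORING_MEM_REGION_REG_WAIT_ARG,
 *	};
 *
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_MEM_REGION, &reg, 1);
 */
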
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	case IORING_REGISTER_ZCRX_IFQ:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_zcrx_ifq(ctx, arg);
		break;
	case IORING_REGISTER_RESIZE_RINGS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_resize_rings(ctx, arg);
		break;
	case IORING_REGISTER_MEM_REGION:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_mem_region(ctx, arg);
		break;
	case IORING_REGISTER_QUERY:
		ret = io_query(ctx, arg, nr_args);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

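/*
 * Most opcodes above share the same calling convention and differ only in
 * what 'arg' points to and how 'nr_args' is interpreted. One more short
 * sketch, registering and later unregistering an eventfd for completion
 * notifications:
 *
 *	int efd = eventfd(0, 0);
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_EVENTFD,
 *		&efd, 1);
 *	...
 *	syscall(__NR_io_uring_register, ring_fd, IORING_UNREGISTER_EVENTFD,
 *		NULL, 0);
 */
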
/*
 * Given an 'fd' value, return the ctx associated with it. If 'registered' is
 * true, then the registered index is used. Otherwise, the normal fd table.
 * Caller must call fput() on the returned file, unless it's an ERR_PTR.
 */
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
	struct file *file;

	if (registered) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return ERR_PTR(-EINVAL);
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (file)
			get_file(file);
	} else {
		file = fget(fd);
	}

	if (unlikely(!file))
		return ERR_PTR(-EBADF);
	if (io_is_uring_fops(file))
		return file;
	fput(file);
	return ERR_PTR(-EOPNOTSUPP);
}

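/*
 * The registered-ring lookup above pairs with IORING_REGISTER_RING_FDS. A
 * hedged sketch: register the ring fd once, then pass the returned offset
 * with IORING_REGISTER_USE_REGISTERED_RING OR'ed into the opcode so later
 * register calls skip the fdtable lookup.
 *
 *	struct io_uring_rsrc_update upd = {
 *		.offset = -1U,	// let the kernel pick a free slot
 *		.data = (__u64)ring_fd,
 *	};
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_RING_FDS,
 *		&upd, 1);
 *	// upd.offset now holds the registered index; any later call can use
 *	// syscall(__NR_io_uring_register, upd.offset,
 *	//	   opcode | IORING_REGISTER_USE_REGISTERED_RING, arg, nr_args);
 */
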
static int io_uring_register_send_msg_ring(void __user *arg, unsigned int nr_args)
{
	struct io_uring_sqe sqe;

	if (!arg || nr_args != 1)
		return -EINVAL;
	if (copy_from_user(&sqe, arg, sizeof(sqe)))
		return -EFAULT;
	/* no flags supported */
	if (sqe.flags)
		return -EINVAL;
	if (sqe.opcode != IORING_OP_MSG_RING)
		return -EINVAL;

	return io_uring_sync_msg_ring(&sqe);
}

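/*
 * Sketch of the blind MSG_RING registration handled above: no source ring
 * is needed (fd == -1), only a hand-built SQE aimed at another ring
 * ('target_fd' below is a placeholder). Field usage follows the normal
 * IORING_OP_MSG_RING convention: sqe->fd is the target ring, and sqe->len
 * and sqe->off become the posted CQE's res and user_data.
 *
 *	struct io_uring_sqe sqe = {};
 *
 *	sqe.opcode = IORING_OP_MSG_RING;
 *	sqe.fd = target_fd;
 *	sqe.len = 0x10;			// cqe->res at the target
 *	sqe.off = 0xcafef00d;		// cqe->user_data at the target
 *
 *	syscall(__NR_io_uring_register, -1,
 *		IORING_REGISTER_SEND_MSG_RING, &sqe, 1);
 */
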
/*
 * "blind" registration opcodes are ones where there's no ring given, and
 * hence the source fd must be -1.
 */
static int io_uring_register_blind(unsigned int opcode, void __user *arg,
				   unsigned int nr_args)
{
	switch (opcode) {
	case IORING_REGISTER_SEND_MSG_RING:
		return io_uring_register_send_msg_ring(arg, nr_args);
	case IORING_REGISTER_QUERY:
		return io_query(NULL, arg, nr_args);
	}
	return -EINVAL;
}

SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (fd == -1)
		return io_uring_register_blind(opcode, arg, nr_args);

	file = io_uring_register_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);

	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
				ctx->buf_table.nr, ret);
	mutex_unlock(&ctx->uring_lock);

	fput(file);
	return ret;
}