xref: /linux/io_uring/register.c (revision c715f13bb30f9f4d1bd8888667ef32e43b6fedc1)
// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "filetable.h"
#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"
#include "msg_ring.h"
#include "memmap.h"
#include "zcrx.h"
#include "query.h"
#include "bpf_filter.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	size = struct_size(p, ops, nr_args);
	p = memdup_user(arg, size);
	if (IS_ERR(p))
		return PTR_ERR(p);
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (io_uring_op_supported(i))
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}
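
/*
 * Example (illustrative userspace sketch, not part of this file): probing
 * which opcodes the running kernel supports via IORING_REGISTER_PROBE.
 * The probe buffer must be zeroed, as io_probe() above rejects non-zero
 * input. A real application would typically use liburing's
 * io_uring_get_probe() instead of the raw syscall shown here.
 *
 *	struct io_uring_probe *p;
 *	int ret;
 *
 *	p = calloc(1, sizeof(*p) + 256 * sizeof(struct io_uring_probe_op));
 *	if (!p)
 *		return -ENOMEM;
 *	ret = syscall(__NR_io_uring_register, ring_fd,
 *		      IORING_REGISTER_PROBE, p, 256);
 *	if (!ret && (p->ops[IORING_OP_READV].flags & IO_URING_OP_SUPPORTED))
 *		printf("IORING_OP_READV is supported\n");
 *	free(p);
 */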

int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}

static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}
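
/*
 * Example (userspace sketch): registering the task's current credentials
 * as a personality, then tagging an SQE with the returned id. Note that
 * IORING_UNREGISTER_PERSONALITY passes the id through nr_args, not arg.
 *
 *	int id = syscall(__NR_io_uring_register, ring_fd,
 *			 IORING_REGISTER_PERSONALITY, NULL, 0);
 *	if (id >= 0)
 *		sqe->personality = id;
 *	...
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_UNREGISTER_PERSONALITY, NULL, id);
 */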

/*
 * Returns number of restrictions parsed and added on success, or < 0 for
 * an error.
 */
static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
					struct io_restriction *restrictions)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = -EINVAL;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST)
				goto err;
			__set_bit(res[i].register_op, restrictions->register_op);
			restrictions->reg_registered = true;
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST)
				goto err;
			__set_bit(res[i].sqe_op, restrictions->sqe_op);
			restrictions->op_registered = true;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			restrictions->sqe_flags_allowed = res[i].sqe_flags;
			restrictions->op_registered = true;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			restrictions->sqe_flags_required = res[i].sqe_flags;
			restrictions->op_registered = true;
			break;
		default:
			goto err;
		}
	}
	ret = nr_args;
	if (!nr_args) {
		restrictions->op_registered = true;
		restrictions->reg_registered = true;
	}
err:
	kfree(res);
	return ret;
}

static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	int ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.op_registered || ctx->restrictions.reg_registered)
		return -EBUSY;

	ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
	/* Reset all restrictions if an error happened */
	if (ret < 0) {
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
		return ret;
	}
	if (ctx->restrictions.op_registered)
		ctx->op_restricted = 1;
	if (ctx->restrictions.reg_registered)
		ctx->reg_restricted = 1;
	return 0;
}
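
/*
 * Example (userspace sketch): restricting a ring that was created with
 * IORING_SETUP_R_DISABLED so that, once enabled, it only accepts the SQE
 * ops and flags allowed here. Restrictions can be registered exactly once
 * and only before IORING_REGISTER_ENABLE_RINGS.
 *
 *	struct io_uring_restriction res[2] = {};
 *	int ret;
 *
 *	res[0].opcode = IORING_RESTRICTION_SQE_OP;
 *	res[0].sqe_op = IORING_OP_READ;
 *	res[1].opcode = IORING_RESTRICTION_SQE_FLAGS_ALLOWED;
 *	res[1].sqe_flags = IOSQE_FIXED_FILE | IOSQE_IO_LINK;
 *	ret = syscall(__NR_io_uring_register, ring_fd,
 *		      IORING_REGISTER_RESTRICTIONS, res, 2);
 */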

static int io_register_restrictions_task(void __user *arg, unsigned int nr_args)
{
	struct io_uring_task_restriction __user *ures = arg;
	struct io_uring_task_restriction tres;
	struct io_restriction *res;
	int ret;

	/* Disallow if task already has registered restrictions */
	if (current->io_uring_restrict)
		return -EPERM;
	/*
	 * Similar to seccomp, disallow registering restrictions if
	 * task_no_new_privs is false and we're not CAP_SYS_ADMIN.
	 */
	if (!task_no_new_privs(current) &&
	    !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
		return -EACCES;
	if (nr_args != 1)
		return -EINVAL;

	if (copy_from_user(&tres, arg, sizeof(tres)))
		return -EFAULT;

	if (tres.flags)
		return -EINVAL;
	if (!mem_is_zero(tres.resv, sizeof(tres.resv)))
		return -EINVAL;

	res = kzalloc_obj(*res, GFP_KERNEL_ACCOUNT);
	if (!res)
		return -ENOMEM;

	ret = io_parse_restrictions(ures->restrictions, tres.nr_res, res);
	if (ret < 0) {
		kfree(res);
		return ret;
	}
	current->io_uring_restrict = res;
	return 0;
}
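
/*
 * Example (hypothetical userspace sketch): attaching restrictions to the
 * calling task via a blind (fd == -1) registration. The layout of
 * struct io_uring_task_restriction is assumed from the copies above:
 * flags and resv must be zero, nr_res counts the entries, and the
 * restriction array trails the struct as a flexible array member.
 *
 *	struct {
 *		struct io_uring_task_restriction tr;
 *		struct io_uring_restriction res[1];
 *	} arg = {};
 *	int ret;
 *
 *	arg.tr.nr_res = 1;
 *	arg.res[0].opcode = IORING_RESTRICTION_SQE_OP;
 *	arg.res[0].sqe_op = IORING_OP_NOP;
 *	ret = syscall(__NR_io_uring_register, -1,
 *		      IORING_REGISTER_RESTRICTIONS, &arg, 1);
 */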

static int io_register_bpf_filter_task(void __user *arg, unsigned int nr_args)
{
	struct io_restriction *res;
	int ret;

	/*
	 * Similar to seccomp, disallow setting a filter if task_no_new_privs
	 * is false and we're not CAP_SYS_ADMIN.
	 */
	if (!task_no_new_privs(current) &&
	    !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
		return -EACCES;

	if (nr_args != 1)
		return -EINVAL;

	/* If no task restrictions exist, set up a new set */
	res = current->io_uring_restrict;
	if (!res) {
		res = kzalloc_obj(*res, GFP_KERNEL_ACCOUNT);
		if (!res)
			return -ENOMEM;
	}

	ret = io_register_bpf_filter(res, arg);
	if (ret) {
		if (res != current->io_uring_restrict)
			kfree(res);
		return ret;
	}
	if (!current->io_uring_restrict)
		current->io_uring_restrict = res;
	return 0;
}

static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER) {
		ctx->submitter_task = get_task_struct(current);
		/*
		 * Lazy activation attempts would fail if the ring was polled
		 * before submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	/* Keep submitter_task store before clearing IORING_SETUP_R_DISABLED */
	smp_store_release(&ctx->flags, ctx->flags & ~IORING_SETUP_R_DISABLED);
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}
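
/*
 * Example (userspace sketch): the disabled-ring lifecycle. The ring is
 * created disabled, optionally restricted, and only then enabled. With
 * IORING_SETUP_SINGLE_ISSUER, the task enabling the ring becomes the
 * submitter task.
 *
 *	struct io_uring_params p = {
 *		.flags = IORING_SETUP_R_DISABLED |
 *			 IORING_SETUP_SINGLE_ISSUER,
 *	};
 *	int ring_fd = syscall(__NR_io_uring_setup, 8, &p);
 *
 *	... register restrictions, mem regions, etc. ...
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_ENABLE_RINGS, NULL, 0);
 */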

static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}
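
/*
 * Example (userspace sketch): pinning this task's io-wq workers (or, for
 * SQPOLL rings, the SQPOLL thread's workers) to CPUs 0-1. nr_args carries
 * the mask size in bytes; unregistering requires a NULL arg and a zero
 * length, per the checks in __io_uring_register() below.
 *
 *	cpu_set_t mask;
 *	int ret;
 *
 *	CPU_ZERO(&mask);
 *	CPU_SET(0, &mask);
 *	CPU_SET(1, &mask);
 *	ret = syscall(__NR_io_uring_register, ring_fd,
 *		      IORING_REGISTER_IOWQ_AFF, &mask, sizeof(mask));
 *	...
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_UNREGISTER_IOWQ_AFF, NULL, 0);
 */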

static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			struct task_struct *tsk;

			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			tsk = sqpoll_task_locked(sqd);
			if (tsk)
				tctx = tsk->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	mutex_lock(&ctx->tctx_lock);
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	mutex_unlock(&ctx->tctx_lock);
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}
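
/*
 * Example (userspace sketch): capping the io-wq worker counts. Index 0
 * bounds the bounded pool and index 1 the unbounded pool; a zero entry
 * leaves that limit untouched, and the previous values are written back
 * into the array on return.
 *
 *	unsigned int counts[2] = { 4, 16 };
 *	int ret;
 *
 *	ret = syscall(__NR_io_uring_register, ring_fd,
 *		      IORING_REGISTER_IOWQ_MAX_WORKERS, counts, 2);
 *	if (!ret)
 *		printf("previous limits: %u bounded, %u unbounded\n",
 *		       counts[0], counts[1]);
 */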

static int io_register_clock(struct io_ring_ctx *ctx,
			     struct io_uring_clock_register __user *arg)
{
	struct io_uring_clock_register reg;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;

	switch (reg.clockid) {
	case CLOCK_MONOTONIC:
		ctx->clock_offset = 0;
		break;
	case CLOCK_BOOTTIME:
		ctx->clock_offset = TK_OFFS_BOOT;
		break;
	default:
		return -EINVAL;
	}

	ctx->clockid = reg.clockid;
	return 0;
}
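
/*
 * Example (userspace sketch): switching CQ wait timeouts from the default
 * CLOCK_MONOTONIC to CLOCK_BOOTTIME so that time spent suspended counts
 * toward the timeout. Note nr_args must be zero for this opcode.
 *
 *	struct io_uring_clock_register reg = {
 *		.clockid = CLOCK_BOOTTIME,
 *	};
 *	int ret;
 *
 *	ret = syscall(__NR_io_uring_register, ring_fd,
 *		      IORING_REGISTER_CLOCK, &reg, 0);
 */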

/*
 * State to maintain until we can swap. Both new and old state, used for
 * either mapping or freeing.
 */
struct io_ring_ctx_rings {
	struct io_rings *rings;
	struct io_uring_sqe *sq_sqes;

	struct io_mapped_region sq_region;
	struct io_mapped_region ring_region;
};

static void io_register_free_rings(struct io_ring_ctx *ctx,
				   struct io_ring_ctx_rings *r)
{
	io_free_region(ctx->user, &r->sq_region);
	io_free_region(ctx->user, &r->ring_region);
}

#define swap_old(ctx, o, n, field)		\
	do {					\
		(o).field = (ctx)->field;	\
		(ctx)->field = (n).field;	\
	} while (0)

#define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
#define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \
			 IORING_SETUP_CQE_MIXED | IORING_SETUP_SQE_MIXED)

static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_ctx_config config;
	struct io_uring_region_desc rd;
	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
	unsigned i, tail, old_head;
	struct io_uring_params *p = &config.p;
	struct io_rings_layout *rl = &config.layout;
	int ret;

	memset(&config, 0, sizeof(config));

	/* limited to DEFER_TASKRUN for now */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		return -EINVAL;
	if (copy_from_user(p, arg, sizeof(*p)))
		return -EFAULT;
	if (p->flags & ~RESIZE_FLAGS)
		return -EINVAL;

	/* properties that are always inherited */
	p->flags |= (ctx->flags & COPY_FLAGS);

	ret = io_prepare_config(&config);
	if (unlikely(ret))
		return ret;

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(rl->rings_size);
	if (p->flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p->cq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
	if (ret)
		return ret;

	n.rings = io_region_get_ptr(&n.ring_region);

	/*
	 * At this point n.rings is shared with userspace, just like o.rings
	 * is as well. While we don't expect userspace to modify it while
	 * a resize is in progress, and it's most likely that userspace will
	 * shoot itself in the foot if it does, we can't always assume good
	 * intent... Use read/write once helpers from here on to indicate the
	 * shared nature of it.
	 */
	WRITE_ONCE(n.rings->sq_ring_mask, p->sq_entries - 1);
	WRITE_ONCE(n.rings->cq_ring_mask, p->cq_entries - 1);
	WRITE_ONCE(n.rings->sq_ring_entries, p->sq_entries);
	WRITE_ONCE(n.rings->cq_ring_entries, p->cq_entries);

	if (copy_to_user(arg, p, sizeof(*p))) {
		io_register_free_rings(ctx, &n);
		return -EFAULT;
	}

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(rl->sq_size);
	if (p->flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p->sq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
	if (ret) {
		io_register_free_rings(ctx, &n);
		return ret;
	}
	n.sq_sqes = io_region_get_ptr(&n.sq_region);

	/*
	 * If using SQPOLL, park the thread
	 */
	if (ctx->sq_data) {
		mutex_unlock(&ctx->uring_lock);
		io_sq_thread_park(ctx->sq_data);
		mutex_lock(&ctx->uring_lock);
	}

	/*
	 * We'll do the swap. Grab the ctx->mmap_lock, which will exclude
	 * any new mmap's on the ring fd. Clear out existing mappings to prevent
	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
	 * existing rings beyond this point will fail. Not that it could proceed
	 * at this point anyway, as the io_uring mmap side needs to grab the
	 * ctx->mmap_lock as well. Likewise, hold the completion lock over the
	 * duration of the actual swap.
	 */
	mutex_lock(&ctx->mmap_lock);
	spin_lock(&ctx->completion_lock);
	o.rings = ctx->rings;
	ctx->rings = NULL;
	o.sq_sqes = ctx->sq_sqes;
	ctx->sq_sqes = NULL;

	/*
	 * Now copy SQ and CQ entries, if any. If either of the destination
	 * rings can't hold what is already there, then fail the operation.
	 */
	tail = READ_ONCE(o.rings->sq.tail);
	old_head = READ_ONCE(o.rings->sq.head);
	if (tail - old_head > p->sq_entries)
		goto overflow;
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->sq_entries - 1);
		unsigned dst_head = i & (p->sq_entries - 1);

		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
	}
	WRITE_ONCE(n.rings->sq.head, old_head);
	WRITE_ONCE(n.rings->sq.tail, tail);

	tail = READ_ONCE(o.rings->cq.tail);
	old_head = READ_ONCE(o.rings->cq.head);
	if (tail - old_head > p->cq_entries) {
overflow:
		/* restore old rings, and return -EOVERFLOW via cleanup path */
		ctx->rings = o.rings;
		ctx->sq_sqes = o.sq_sqes;
		to_free = &n;
		ret = -EOVERFLOW;
		goto out;
	}
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->cq_entries - 1);
		unsigned dst_head = i & (p->cq_entries - 1);

		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
	}
	WRITE_ONCE(n.rings->cq.head, old_head);
	WRITE_ONCE(n.rings->cq.tail, tail);
	/* invalidate cached cqe refill */
	ctx->cqe_cached = ctx->cqe_sentinel = NULL;

	WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped));
	atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags));
	WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags));
	WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));

	/* all done, store old pointers and assign new ones */
	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
		ctx->sq_array = (u32 *)((char *)n.rings + rl->sq_array_offset);

	ctx->sq_entries = p->sq_entries;
	ctx->cq_entries = p->cq_entries;

	/*
	 * Just mark any flag we may have missed and that the application
	 * should act on unconditionally. Worst case it'll be an extra
	 * syscall.
	 */
	atomic_or(IORING_SQ_TASKRUN | IORING_SQ_NEED_WAKEUP, &n.rings->sq_flags);
	ctx->rings = n.rings;
	rcu_assign_pointer(ctx->rings_rcu, n.rings);

	ctx->sq_sqes = n.sq_sqes;
	swap_old(ctx, o, n, ring_region);
	swap_old(ctx, o, n, sq_region);
	to_free = &o;
	ret = 0;
out:
	spin_unlock(&ctx->completion_lock);
	mutex_unlock(&ctx->mmap_lock);
	/* Wait for concurrent io_ctx_mark_taskrun() */
	if (to_free == &o)
		synchronize_rcu_expedited();
	io_register_free_rings(ctx, to_free);

	if (ctx->sq_data)
		io_sq_thread_unpark(ctx->sq_data);

	return ret;
}
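
/*
 * Example (userspace sketch): resizing a DEFER_TASKRUN ring in place.
 * Only the entry counts and the RESIZE_FLAGS above may be set by the
 * caller; inherited COPY_FLAGS are filled in by the kernel, and the
 * updated ring offsets are copied back through the same struct.
 *
 *	struct io_uring_params p = {
 *		.sq_entries = 256,
 *		.cq_entries = 1024,
 *		.flags = IORING_SETUP_CQSIZE,
 *	};
 *	int ret;
 *
 *	ret = syscall(__NR_io_uring_register, ring_fd,
 *		      IORING_REGISTER_RESIZE_RINGS, &p, 1);
 */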

static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
{
	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
	struct io_uring_mem_region_reg reg;
	struct io_uring_region_desc __user *rd_uptr;
	struct io_uring_region_desc rd;
	struct io_mapped_region region = {};
	int ret;

	if (io_region_is_set(&ctx->param_region))
		return -EBUSY;
	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
		return -EFAULT;
	rd_uptr = u64_to_user_ptr(reg.region_uptr);
	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;
	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
		return -EINVAL;

	/*
	 * This ensures there are no waiters. Waiters are unlocked and it's
	 * hard to synchronise with them, especially if we need to initialise
	 * the region.
	 */
	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
	    !(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EINVAL;

	ret = io_create_region(ctx, &region, &rd, IORING_MAP_OFF_PARAM_REGION);
	if (ret)
		return ret;
	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
		io_free_region(ctx->user, &region);
		return -EFAULT;
	}

	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
		ctx->cq_wait_arg = io_region_get_ptr(&region);
		ctx->cq_wait_size = rd.size;
	}

	io_region_publish(ctx, &region, &ctx->param_region);
	return 0;
}
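
/*
 * Example (userspace sketch): handing the kernel a user-provided region
 * to be used for extended CQ wait arguments. With the WAIT_ARG flag this
 * must happen while the ring is still IORING_SETUP_R_DISABLED, per the
 * check above.
 *
 *	static char buf[4096] __attribute__((aligned(4096)));
 *	struct io_uring_region_desc rd = {
 *		.user_addr = (unsigned long)buf,
 *		.size = sizeof(buf),
 *		.flags = IORING_MEM_REGION_TYPE_USER,
 *	};
 *	struct io_uring_mem_region_reg mr = {
 *		.region_uptr = (unsigned long)&rd,
 *		.flags = IORING_MEM_REGION_REG_WAIT_ARG,
 *	};
 *	int ret;
 *
 *	ret = syscall(__NR_io_uring_register, ring_fd,
 *		      IORING_REGISTER_MEM_REGION, &mr, 1);
 */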

static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->reg_restricted && !(ctx->flags & IORING_SETUP_R_DISABLED)) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	case IORING_REGISTER_ZCRX_IFQ:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_zcrx_ifq(ctx, arg);
		break;
	case IORING_REGISTER_RESIZE_RINGS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_resize_rings(ctx, arg);
		break;
	case IORING_REGISTER_MEM_REGION:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_mem_region(ctx, arg);
		break;
	case IORING_REGISTER_QUERY:
		ret = io_query(arg, nr_args);
		break;
	case IORING_REGISTER_ZCRX_CTRL:
		ret = io_zcrx_ctrl(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_BPF_FILTER:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_register_bpf_filter(&ctx->restrictions, arg);
		if (!ret)
			WRITE_ONCE(ctx->bpf_filters,
				   ctx->restrictions.bpf_filters->filters);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

/*
 * Given an 'fd' value, return the ctx associated with it. If 'registered' is
 * true, then the registered index is used. Otherwise, the normal fd table.
 * Caller must call fput() on the returned file, unless it's an ERR_PTR.
 */
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
	struct file *file;

	if (registered) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return ERR_PTR(-EINVAL);
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (file)
			get_file(file);
	} else {
		file = fget(fd);
	}

	if (unlikely(!file))
		return ERR_PTR(-EBADF);
	if (io_is_uring_fops(file))
		return file;
	fput(file);
	return ERR_PTR(-EOPNOTSUPP);
}
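
/*
 * Example (userspace sketch): registering the ring fd itself so later
 * io_uring_register() calls can skip the fdtable lookup. An offset of -1
 * asks the kernel to pick a free slot; the chosen index is written back
 * and is then passed as "fd" together with the
 * IORING_REGISTER_USE_REGISTERED_RING flag.
 *
 *	struct io_uring_rsrc_update up = {
 *		.offset = -1U,
 *		.data = ring_fd,
 *	};
 *	int ret;
 *
 *	ret = syscall(__NR_io_uring_register, ring_fd,
 *		      IORING_REGISTER_RING_FDS, &up, 1);
 *	if (ret == 1)
 *		syscall(__NR_io_uring_register, up.offset,
 *			IORING_REGISTER_PROBE |
 *			IORING_REGISTER_USE_REGISTERED_RING, p, 256);
 */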

static int io_uring_register_send_msg_ring(void __user *arg, unsigned int nr_args)
{
	struct io_uring_sqe sqe;

	if (!arg || nr_args != 1)
		return -EINVAL;
	if (copy_from_user(&sqe, arg, sizeof(sqe)))
		return -EFAULT;
	/* no flags supported */
	if (sqe.flags)
		return -EINVAL;
	if (sqe.opcode != IORING_OP_MSG_RING)
		return -EINVAL;

	return io_uring_sync_msg_ring(&sqe);
}

/*
 * "blind" registration opcodes are ones where there's no ring given, and
 * hence the source fd must be -1.
 */
static int io_uring_register_blind(unsigned int opcode, void __user *arg,
				   unsigned int nr_args)
{
	switch (opcode) {
	case IORING_REGISTER_SEND_MSG_RING:
		return io_uring_register_send_msg_ring(arg, nr_args);
	case IORING_REGISTER_QUERY:
		return io_query(arg, nr_args);
	case IORING_REGISTER_RESTRICTIONS:
		return io_register_restrictions_task(arg, nr_args);
	case IORING_REGISTER_BPF_FILTER:
		return io_register_bpf_filter_task(arg, nr_args);
	}
	return -EINVAL;
}
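
/*
 * Example (userspace sketch): a blind registration with fd passed as -1.
 * Here a synchronous MSG_RING is sent to another ring without the caller
 * owning a ring of its own; the SQE fields follow the usual MSG_RING
 * layout, with len carrying the posted res and off the posted user_data,
 * and sqe.flags must be zero per the check above.
 *
 *	struct io_uring_sqe sqe = {};
 *	int ret;
 *
 *	sqe.opcode = IORING_OP_MSG_RING;
 *	sqe.fd = target_ring_fd;
 *	sqe.len = 0x10;
 *	sqe.off = 0xcafe;
 *	ret = syscall(__NR_io_uring_register, -1,
 *		      IORING_REGISTER_SEND_MSG_RING, &sqe, 1);
 */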

SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (fd == -1)
		return io_uring_register_blind(opcode, arg, nr_args);

	file = io_uring_register_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);

	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
				ctx->buf_table.nr, ret);
	mutex_unlock(&ctx->uring_lock);

	fput(file);
	return ret;
}