xref: /linux/io_uring/register.c (revision e41255ce7acc4a3412ecdaa74b32deee980d27f7)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Code related to the io_uring_register() syscall
4  *
5  * Copyright (C) 2023 Jens Axboe
6  */
7 #include <linux/kernel.h>
8 #include <linux/errno.h>
9 #include <linux/syscalls.h>
10 #include <linux/refcount.h>
11 #include <linux/bits.h>
12 #include <linux/fs.h>
13 #include <linux/file.h>
14 #include <linux/slab.h>
15 #include <linux/uaccess.h>
16 #include <linux/nospec.h>
17 #include <linux/compat.h>
18 #include <linux/io_uring.h>
19 #include <linux/io_uring_types.h>
20 
21 #include "filetable.h"
22 #include "io_uring.h"
23 #include "opdef.h"
24 #include "tctx.h"
25 #include "rsrc.h"
26 #include "sqpoll.h"
27 #include "register.h"
28 #include "cancel.h"
29 #include "kbuf.h"
30 #include "napi.h"
31 #include "eventfd.h"
32 #include "msg_ring.h"
33 #include "memmap.h"
34 #include "zcrx.h"
35 #include "query.h"
36 #include "bpf_filter.h"
37 
/* Upper bound on the number of restriction entries accepted in one call */
#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)
40 
41 static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
42 			   unsigned nr_args)
43 {
44 	struct io_uring_probe *p;
45 	size_t size;
46 	int i, ret;
47 
48 	if (nr_args > IORING_OP_LAST)
49 		nr_args = IORING_OP_LAST;
50 
51 	size = struct_size(p, ops, nr_args);
52 	p = memdup_user(arg, size);
53 	if (IS_ERR(p))
54 		return PTR_ERR(p);
55 	ret = -EINVAL;
56 	if (memchr_inv(p, 0, size))
57 		goto out;
58 
59 	p->last_op = IORING_OP_LAST - 1;
60 
61 	for (i = 0; i < nr_args; i++) {
62 		p->ops[i].op = i;
63 		if (io_uring_op_supported(i))
64 			p->ops[i].flags = IO_URING_OP_SUPPORTED;
65 	}
66 	p->ops_len = i;
67 
68 	ret = 0;
69 	if (copy_to_user(arg, p, size))
70 		ret = -EFAULT;
71 out:
72 	kfree(p);
73 	return ret;
74 }
75 
76 int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
77 {
78 	const struct cred *creds;
79 
80 	creds = xa_erase(&ctx->personalities, id);
81 	if (creds) {
82 		put_cred(creds);
83 		return 0;
84 	}
85 
86 	return -EINVAL;
87 }
88 
89 
/*
 * Register the current task's credentials with the ring. On success the
 * allocated personality id (>= 0) is returned; on failure a negative error.
 * The cred reference taken here is dropped on unregister or ring teardown.
 */
static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	/* ids cycle within [0, USHRT_MAX] via ctx->pers_next */
	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		/* allocation failed, drop the reference taken above */
		put_cred(creds);
		return ret;
	}
	return id;
}
106 
/*
 * Returns number of restrictions parsed and added on success, or < 0 for
 * an error.
 */
static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
					struct io_restriction *restrictions)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	/* any malformed entry below fails the whole set with -EINVAL */
	ret = -EINVAL;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST)
				goto err;
			__set_bit(res[i].register_op, restrictions->register_op);
			restrictions->reg_registered = true;
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST)
				goto err;
			__set_bit(res[i].sqe_op, restrictions->sqe_op);
			restrictions->op_registered = true;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			restrictions->sqe_flags_allowed = res[i].sqe_flags;
			restrictions->op_registered = true;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			restrictions->sqe_flags_required = res[i].sqe_flags;
			restrictions->op_registered = true;
			break;
		default:
			goto err;
		}
	}
	ret = nr_args;
	/*
	 * Zero entries still marks both classes as registered, so the caller
	 * enables restriction checking with empty allow-sets.
	 */
	if (!nr_args) {
		restrictions->op_registered = true;
		restrictions->reg_registered = true;
	}
err:
	kfree(res);
	return ret;
}
166 
/*
 * Register per-ring restrictions. Only allowed while the ring is still
 * disabled (IORING_SETUP_R_DISABLED), and only once per ring. On success
 * the ctx-level restricted flags are latched for fast-path checks.
 */
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	int ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.op_registered || ctx->restrictions.reg_registered)
		return -EBUSY;

	ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
	/*
	 * Reset all restrictions if an error happened, but retain any COW'ed
	 * settings.
	 */
	if (ret < 0) {
		/* parsing may have partially populated the bitmaps; wipe them */
		struct io_bpf_filters *bpf = ctx->restrictions.bpf_filters;
		bool cowed = ctx->restrictions.bpf_filters_cow;

		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
		ctx->restrictions.bpf_filters = bpf;
		ctx->restrictions.bpf_filters_cow = cowed;
		return ret;
	}
	if (ctx->restrictions.op_registered)
		ctx->op_restricted = 1;
	if (ctx->restrictions.reg_registered)
		ctx->reg_restricted = 1;
	return 0;
}
200 
/*
 * Register task-wide io_uring restrictions (the "blind", no-ring variant).
 * Allocates a fresh io_restriction set, parses the user-supplied entries
 * into it, and attaches it to current->io_uring_restrict. Only one set may
 * be registered per task.
 */
static int io_register_restrictions_task(void __user *arg, unsigned int nr_args)
{
	struct io_uring_task_restriction __user *ures = arg;
	struct io_uring_task_restriction tres;
	struct io_restriction *res;
	int ret;

	/* Disallow if task already has registered restrictions */
	if (current->io_uring_restrict)
		return -EPERM;
	/*
	 * Similar to seccomp, disallow setting a filter if task_no_new_privs
	 * is false and we're not CAP_SYS_ADMIN.
	 */
	if (!task_no_new_privs(current) &&
	    !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
		return -EACCES;
	if (nr_args != 1)
		return -EINVAL;

	if (copy_from_user(&tres, arg, sizeof(tres)))
		return -EFAULT;

	/* no flags defined yet, and reserved fields must be zero */
	if (tres.flags)
		return -EINVAL;
	if (!mem_is_zero(tres.resv, sizeof(tres.resv)))
		return -EINVAL;

	res = kzalloc_obj(*res, GFP_KERNEL_ACCOUNT);
	if (!res)
		return -ENOMEM;

	ret = io_parse_restrictions(ures->restrictions, tres.nr_res, res);
	if (ret < 0) {
		kfree(res);
		return ret;
	}
	current->io_uring_restrict = res;
	return 0;
}
241 
/*
 * Attach a BPF filter to the task-wide restriction set. If the task has no
 * restriction set yet, one is allocated here; it is only installed into
 * current->io_uring_restrict once the filter registration succeeded, so a
 * failure leaves the task state untouched.
 */
static int io_register_bpf_filter_task(void __user *arg, unsigned int nr_args)
{
	struct io_restriction *res;
	int ret;

	/*
	 * Similar to seccomp, disallow setting a filter if task_no_new_privs
	 * is false and we're not CAP_SYS_ADMIN.
	 */
	if (!task_no_new_privs(current) &&
	    !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
		return -EACCES;

	if (nr_args != 1)
		return -EINVAL;

	/* If no task restrictions exist, setup a new set */
	res = current->io_uring_restrict;
	if (!res) {
		res = kzalloc_obj(*res, GFP_KERNEL_ACCOUNT);
		if (!res)
			return -ENOMEM;
	}

	ret = io_register_bpf_filter(res, arg);
	if (ret) {
		/* only free the set if we allocated it above */
		if (res != current->io_uring_restrict)
			kfree(res);
		return ret;
	}
	if (!current->io_uring_restrict)
		current->io_uring_restrict = res;
	return 0;
}
276 
/*
 * Enable a ring created with IORING_SETUP_R_DISABLED. For SINGLE_ISSUER
 * rings, the enabling task becomes the submitter task. Store ordering
 * matters here: submitter_task must be visible before the disabled flag
 * clears.
 */
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER) {
		ctx->submitter_task = get_task_struct(current);
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	/* Keep submitter_task store before clearing IORING_SETUP_R_DISABLED */
	smp_store_release(&ctx->flags, ctx->flags & ~IORING_SETUP_R_DISABLED);
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}
298 
299 static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
300 					 cpumask_var_t new_mask)
301 {
302 	int ret;
303 
304 	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
305 		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
306 	} else {
307 		mutex_unlock(&ctx->uring_lock);
308 		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
309 		mutex_lock(&ctx->uring_lock);
310 	}
311 
312 	return ret;
313 }
314 
315 static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
316 				       void __user *arg, unsigned len)
317 {
318 	cpumask_var_t new_mask;
319 	int ret;
320 
321 	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
322 		return -ENOMEM;
323 
324 	cpumask_clear(new_mask);
325 	if (len > cpumask_size())
326 		len = cpumask_size();
327 
328 #ifdef CONFIG_COMPAT
329 	if (in_compat_syscall())
330 		ret = compat_get_bitmap(cpumask_bits(new_mask),
331 					(const compat_ulong_t __user *)arg,
332 					len * 8 /* CHAR_BIT */);
333 	else
334 #endif
335 		ret = copy_from_user(new_mask, arg, len);
336 
337 	if (ret) {
338 		free_cpumask_var(new_mask);
339 		return -EFAULT;
340 	}
341 
342 	ret = __io_register_iowq_aff(ctx, new_mask);
343 	free_cpumask_var(new_mask);
344 	return ret;
345 }
346 
/* Clearing the affinity is just registering a NULL mask */
static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}
351 
/*
 * Set (and read back) the max bounded/unbounded io-wq worker counts.
 * new_count[0]/[1] of zero means "don't change, report current". For
 * SQPOLL rings the limits apply to the sqpoll task's io-wq; otherwise
 * they are stored on the ctx and propagated to every registered task.
 * Lock ordering is sqd->lock -> ctx->uring_lock, hence the drop/retake
 * dance below.
 */
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			struct task_struct *tsk;

			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			tsk = sqpoll_task_locked(sqd);
			if (tsk)
				tctx = tsk->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	/* only non-zero entries update the stored limits */
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		/* no io-wq to query; report zeroes back to userspace */
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	/* new_count now holds the previous limits, copy them back */
	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	mutex_lock(&ctx->tctx_lock);
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	mutex_unlock(&ctx->tctx_lock);
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}
442 
443 static int io_register_clock(struct io_ring_ctx *ctx,
444 			     struct io_uring_clock_register __user *arg)
445 {
446 	struct io_uring_clock_register reg;
447 
448 	if (copy_from_user(&reg, arg, sizeof(reg)))
449 		return -EFAULT;
450 	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
451 		return -EINVAL;
452 
453 	switch (reg.clockid) {
454 	case CLOCK_MONOTONIC:
455 		ctx->clock_offset = 0;
456 		break;
457 	case CLOCK_BOOTTIME:
458 		ctx->clock_offset = TK_OFFS_BOOT;
459 		break;
460 	default:
461 		return -EINVAL;
462 	}
463 
464 	ctx->clockid = reg.clockid;
465 	return 0;
466 }
467 
/*
 * State to maintain until we can swap. Both new and old state, used for
 * either mapping or freeing.
 */
struct io_ring_ctx_rings {
	struct io_rings *rings;		/* SQ/CQ ring header + CQE array */
	struct io_uring_sqe *sq_sqes;	/* SQE array */

	struct io_mapped_region sq_region;	/* backing region for sq_sqes */
	struct io_mapped_region ring_region;	/* backing region for rings */
};
479 
/* Release both mapped regions of a (new or old) ring pair */
static void io_register_free_rings(struct io_ring_ctx *ctx,
				   struct io_ring_ctx_rings *r)
{
	io_free_region(ctx->user, &r->sq_region);
	io_free_region(ctx->user, &r->ring_region);
}
486 
/* Save the ctx's current @field into @o, then install @n's value */
#define swap_old(ctx, o, n, field)		\
	do {					\
		(o).field = (ctx)->field;	\
		(ctx)->field = (n).field;	\
	} while (0)

/* setup flags userspace may pass to a ring resize */
#define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
/* setup flags always inherited from the existing ring on resize */
#define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \
			 IORING_SETUP_CQE_MIXED | IORING_SETUP_SQE_MIXED)
497 
/*
 * Resize the SQ/CQ rings in place: allocate new ring and SQE regions,
 * copy over any pending entries, then swap them in under mmap_lock and
 * completion_lock so concurrent mmap and CQE posting can't observe a
 * half-swapped state. Only supported for DEFER_TASKRUN rings.
 */
static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_ctx_config config;
	struct io_uring_region_desc rd;
	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
	unsigned i, tail, old_head;
	struct io_uring_params *p = &config.p;
	struct io_rings_layout *rl = &config.layout;
	int ret;

	memset(&config, 0, sizeof(config));

	/* limited to DEFER_TASKRUN for now */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		return -EINVAL;
	if (copy_from_user(p, arg, sizeof(*p)))
		return -EFAULT;
	if (p->flags & ~RESIZE_FLAGS)
		return -EINVAL;

	/* properties that are always inherited */
	p->flags |= (ctx->flags & COPY_FLAGS);

	ret = io_prepare_config(&config);
	if (unlikely(ret))
		return ret;

	/* allocate the new CQ/rings region first */
	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(rl->rings_size);
	if (p->flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p->cq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
	if (ret)
		return ret;

	n.rings = io_region_get_ptr(&n.ring_region);

	/*
	 * At this point n.rings is shared with userspace, just like o.rings
	 * is as well. While we don't expect userspace to modify it while
	 * a resize is in progress, and it's most likely that userspace will
	 * shoot itself in the foot if it does, we can't always assume good
	 * intent... Use read/write once helpers from here on to indicate the
	 * shared nature of it.
	 */
	WRITE_ONCE(n.rings->sq_ring_mask, p->sq_entries - 1);
	WRITE_ONCE(n.rings->cq_ring_mask, p->cq_entries - 1);
	WRITE_ONCE(n.rings->sq_ring_entries, p->sq_entries);
	WRITE_ONCE(n.rings->cq_ring_entries, p->cq_entries);

	/* report the final (possibly clamped) parameters back to userspace */
	if (copy_to_user(arg, p, sizeof(*p))) {
		io_register_free_rings(ctx, &n);
		return -EFAULT;
	}

	/* now allocate the new SQE array region */
	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(rl->sq_size);
	if (p->flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p->sq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
	if (ret) {
		io_register_free_rings(ctx, &n);
		return ret;
	}
	n.sq_sqes = io_region_get_ptr(&n.sq_region);

	/*
	 * If using SQPOLL, park the thread
	 */
	if (ctx->sq_data) {
		mutex_unlock(&ctx->uring_lock);
		io_sq_thread_park(ctx->sq_data);
		mutex_lock(&ctx->uring_lock);
	}

	/*
	 * We'll do the swap. Grab the ctx->mmap_lock, which will exclude
	 * any new mmap's on the ring fd. Clear out existing mappings to prevent
	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
	 * existing rings beyond this point will fail. Not that it could proceed
	 * at this point anyway, as the io_uring mmap side needs go grab the
	 * ctx->mmap_lock as well. Likewise, hold the completion lock over the
	 * duration of the actual swap.
	 */
	mutex_lock(&ctx->mmap_lock);
	spin_lock(&ctx->completion_lock);
	o.rings = ctx->rings;
	ctx->rings = NULL;
	o.sq_sqes = ctx->sq_sqes;
	ctx->sq_sqes = NULL;

	/*
	 * Now copy SQ and CQ entries, if any. If either of the destination
	 * rings can't hold what is already there, then fail the operation.
	 */
	tail = READ_ONCE(o.rings->sq.tail);
	old_head = READ_ONCE(o.rings->sq.head);
	if (tail - old_head > p->sq_entries)
		goto overflow;
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->sq_entries - 1);
		unsigned dst_head = i & (p->sq_entries - 1);

		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
	}
	WRITE_ONCE(n.rings->sq.head, old_head);
	WRITE_ONCE(n.rings->sq.tail, tail);

	tail = READ_ONCE(o.rings->cq.tail);
	old_head = READ_ONCE(o.rings->cq.head);
	if (tail - old_head > p->cq_entries) {
overflow:
		/* restore old rings, and return -EOVERFLOW via cleanup path */
		ctx->rings = o.rings;
		ctx->sq_sqes = o.sq_sqes;
		to_free = &n;
		ret = -EOVERFLOW;
		goto out;
	}
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->cq_entries - 1);
		unsigned dst_head = i & (p->cq_entries - 1);

		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
	}
	WRITE_ONCE(n.rings->cq.head, old_head);
	WRITE_ONCE(n.rings->cq.tail, tail);
	/* invalidate cached cqe refill */
	ctx->cqe_cached = ctx->cqe_sentinel = NULL;

	WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped));
	atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags));
	WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags));
	WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));

	/* all done, store old pointers and assign new ones */
	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
		ctx->sq_array = (u32 *)((char *)n.rings + rl->sq_array_offset);

	ctx->sq_entries = p->sq_entries;
	ctx->cq_entries = p->cq_entries;

	/*
	 * Just mark any flag we may have missed and that the application
	 * should act on unconditionally. Worst case it'll be an extra
	 * syscall.
	 */
	atomic_or(IORING_SQ_TASKRUN | IORING_SQ_NEED_WAKEUP, &n.rings->sq_flags);
	ctx->rings = n.rings;
	rcu_assign_pointer(ctx->rings_rcu, n.rings);

	ctx->sq_sqes = n.sq_sqes;
	swap_old(ctx, o, n, ring_region);
	swap_old(ctx, o, n, sq_region);
	to_free = &o;
	ret = 0;
out:
	spin_unlock(&ctx->completion_lock);
	mutex_unlock(&ctx->mmap_lock);
	/* Wait for concurrent io_ctx_mark_taskrun() */
	if (to_free == &o)
		synchronize_rcu_expedited();
	io_register_free_rings(ctx, to_free);

	if (ctx->sq_data)
		io_sq_thread_unpark(ctx->sq_data);

	return ret;
}
671 
/*
 * Register a user-supplied memory region with the ring (the "parameter
 * region"). Only one such region may exist per ring. The resulting region
 * descriptor is copied back to userspace, and the region may optionally be
 * used as the CQ wait-argument area.
 */
static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
{
	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
	struct io_uring_mem_region_reg reg;
	struct io_uring_region_desc __user *rd_uptr;
	struct io_uring_region_desc rd;
	struct io_mapped_region region = {};
	int ret;

	if (io_region_is_set(&ctx->param_region))
		return -EBUSY;
	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
		return -EFAULT;
	rd_uptr = u64_to_user_ptr(reg.region_uptr);
	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;
	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
		return -EINVAL;

	/*
	 * This ensures there are no waiters. Waiters are unlocked and it's
	 * hard to synchronise with them, especially if we need to initialise
	 * the region.
	 */
	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
	    !(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EINVAL;

	ret = io_create_region(ctx, &region, &rd, IORING_MAP_OFF_PARAM_REGION);
	if (ret)
		return ret;
	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
		/* couldn't report the region back; undo the creation */
		io_free_region(ctx->user, &region);
		return -EFAULT;
	}

	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
		ctx->cq_wait_arg = io_region_get_ptr(&region);
		ctx->cq_wait_size = rd.size;
	}

	io_region_publish(ctx, &region, &ctx->param_region);
	return 0;
}
718 
/*
 * Dispatch a ring-attached register opcode. Called with ctx->uring_lock
 * held; some handlers temporarily drop and retake it (hence the sparse
 * annotations below). Per-opcode arg/nr_args validation happens inline
 * before calling into each handler.
 */
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	/* if a submitter task is set, only that task may register */
	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	/* enforce registered restrictions once the ring has been enabled */
	if (ctx->reg_restricted && !(ctx->flags & IORING_SETUP_R_DISABLED)) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		/* last argument selects async notification */
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		/* nr_args carries the personality id here */
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	case IORING_REGISTER_ZCRX_IFQ:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_zcrx_ifq(ctx, arg);
		break;
	case IORING_REGISTER_RESIZE_RINGS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_resize_rings(ctx, arg);
		break;
	case IORING_REGISTER_MEM_REGION:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_mem_region(ctx, arg);
		break;
	case IORING_REGISTER_QUERY:
		ret = io_query(arg, nr_args);
		break;
	case IORING_REGISTER_ZCRX_CTRL:
		ret = io_zcrx_ctrl(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_BPF_FILTER:
		ret = -EINVAL;

		if (nr_args != 1)
			break;
		ret = io_register_bpf_filter(&ctx->restrictions, arg);
		/* publish the filter list for lockless readers on success */
		if (!ret)
			WRITE_ONCE(ctx->bpf_filters,
				   ctx->restrictions.bpf_filters->filters);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
948 
949 /*
950  * Given an 'fd' value, return the ctx associated with if. If 'registered' is
951  * true, then the registered index is used. Otherwise, the normal fd table.
952  * Caller must call fput() on the returned file, unless it's an ERR_PTR.
953  */
954 struct file *io_uring_register_get_file(unsigned int fd, bool registered)
955 {
956 	struct file *file;
957 
958 	if (registered) {
959 		/*
960 		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
961 		 * need only dereference our task private array to find it.
962 		 */
963 		struct io_uring_task *tctx = current->io_uring;
964 
965 		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
966 			return ERR_PTR(-EINVAL);
967 		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
968 		file = tctx->registered_rings[fd];
969 		if (file)
970 			get_file(file);
971 	} else {
972 		file = fget(fd);
973 	}
974 
975 	if (unlikely(!file))
976 		return ERR_PTR(-EBADF);
977 	if (io_is_uring_fops(file))
978 		return file;
979 	fput(file);
980 	return ERR_PTR(-EOPNOTSUPP);
981 }
982 
983 static int io_uring_register_send_msg_ring(void __user *arg, unsigned int nr_args)
984 {
985 	struct io_uring_sqe sqe;
986 
987 	if (!arg || nr_args != 1)
988 		return -EINVAL;
989 	if (copy_from_user(&sqe, arg, sizeof(sqe)))
990 		return -EFAULT;
991 	/* no flags supported */
992 	if (sqe.flags)
993 		return -EINVAL;
994 	if (sqe.opcode != IORING_OP_MSG_RING)
995 		return -EINVAL;
996 
997 	return io_uring_sync_msg_ring(&sqe);
998 }
999 
1000 /*
1001  * "blind" registration opcodes are ones where there's no ring given, and
1002  * hence the source fd must be -1.
1003  */
1004 static int io_uring_register_blind(unsigned int opcode, void __user *arg,
1005 				   unsigned int nr_args)
1006 {
1007 	switch (opcode) {
1008 	case IORING_REGISTER_SEND_MSG_RING:
1009 		return io_uring_register_send_msg_ring(arg, nr_args);
1010 	case IORING_REGISTER_QUERY:
1011 		return io_query(arg, nr_args);
1012 	case IORING_REGISTER_RESTRICTIONS:
1013 		return io_register_restrictions_task(arg, nr_args);
1014 	case IORING_REGISTER_BPF_FILTER:
1015 		return io_register_bpf_filter_task(arg, nr_args);
1016 	}
1017 	return -EINVAL;
1018 }
1019 
/*
 * io_uring_register(2) entry point. Strips the USE_REGISTERED_RING flag
 * from the opcode, routes fd == -1 to the blind (ring-less) handlers, and
 * otherwise resolves the ring file and dispatches under uring_lock.
 */
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	/* fd of -1 selects the ring-less opcodes */
	if (fd == -1)
		return io_uring_register_blind(opcode, arg, nr_args);

	file = io_uring_register_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);

	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
				ctx->buf_table.nr, ret);
	mutex_unlock(&ctx->uring_lock);

	/* drop the reference taken by io_uring_register_get_file() */
	fput(file);
	return ret;
}
1052