// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "filetable.h"
#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"
#include "msg_ring.h"
#include "memmap.h"
#include "zcrx.h"
#include "query.h"
#include "bpf_filter.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

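/*
 * Fill in an io_uring_probe structure for userspace: clamp the requested
 * number of entries to IORING_OP_LAST, mark each opcode this kernel
 * supports with IO_URING_OP_SUPPORTED, and copy the result back.
 */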
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	size = struct_size(p, ops, nr_args);
	p = memdup_user(arg, size);
	if (IS_ERR(p))
		return PTR_ERR(p);
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (io_uring_op_supported(i))
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}

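/*
 * Register the current task's credentials with the ring and return an id
 * for them. Requests can later run with these credentials by setting
 * sqe->personality to the returned id.
 */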
static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

/*
 * Returns number of restrictions parsed and added on success, or < 0 for
 * an error.
 */
static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
					struct io_restriction *restrictions)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = -EINVAL;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST)
				goto err;
			__set_bit(res[i].register_op, restrictions->register_op);
			restrictions->reg_registered = true;
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST)
				goto err;
			__set_bit(res[i].sqe_op, restrictions->sqe_op);
			restrictions->op_registered = true;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			restrictions->sqe_flags_allowed = res[i].sqe_flags;
			restrictions->op_registered = true;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			restrictions->sqe_flags_required = res[i].sqe_flags;
			restrictions->op_registered = true;
			break;
		default:
			goto err;
		}
	}
	ret = nr_args;
	/* an empty set still counts as registered, i.e. nothing is allowed */
	if (!nr_args) {
		restrictions->op_registered = true;
		restrictions->reg_registered = true;
	}
err:
	kfree(res);
	return ret;
}

static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	int ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.op_registered || ctx->restrictions.reg_registered)
		return -EBUSY;

	ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
	/* Reset all restrictions if an error happened */
	if (ret < 0) {
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
		return ret;
	}
	if (ctx->restrictions.op_registered)
		ctx->op_restricted = 1;
	if (ctx->restrictions.reg_registered)
		ctx->reg_restricted = 1;
	return 0;
}

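/*
 * Task-level variant: parse a single io_uring_task_restriction from
 * userspace and attach the resulting restriction set to the current task
 * (current->io_uring_restrict). A task may install restrictions only once
 * and, like seccomp filters, doing so requires either no_new_privs or
 * CAP_SYS_ADMIN.
 */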
static int io_register_restrictions_task(void __user *arg, unsigned int nr_args)
{
	struct io_uring_task_restriction __user *ures = arg;
	struct io_uring_task_restriction tres;
	struct io_restriction *res;
	int ret;

	/* Disallow if task already has registered restrictions */
	if (current->io_uring_restrict)
		return -EPERM;
	/*
	 * Similar to seccomp, disallow setting a filter unless
	 * task_no_new_privs is true or we're CAP_SYS_ADMIN.
	 */
	if (!task_no_new_privs(current) &&
	    !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
		return -EACCES;
	if (nr_args != 1)
		return -EINVAL;

	if (copy_from_user(&tres, arg, sizeof(tres)))
		return -EFAULT;

	if (tres.flags)
		return -EINVAL;
	if (!mem_is_zero(tres.resv, sizeof(tres.resv)))
		return -EINVAL;

	res = kzalloc(sizeof(*res), GFP_KERNEL_ACCOUNT);
	if (!res)
		return -ENOMEM;

	ret = io_parse_restrictions(ures->restrictions, tres.nr_res, res);
	if (ret < 0) {
		kfree(res);
		return ret;
	}
	current->io_uring_restrict = res;
	return 0;
}

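/*
 * Task-level BPF filter registration: attach a BPF filter to the task's
 * restriction set, allocating the set first if the task doesn't have one
 * yet. The same no_new_privs/CAP_SYS_ADMIN rule as for task restrictions
 * applies.
 */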
static int io_register_bpf_filter_task(void __user *arg, unsigned int nr_args)
{
	struct io_restriction *res;
	int ret;

	/*
	 * Similar to seccomp, disallow setting a filter unless
	 * task_no_new_privs is true or we're CAP_SYS_ADMIN.
	 */
	if (!task_no_new_privs(current) &&
	    !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
		return -EACCES;

	if (nr_args != 1)
		return -EINVAL;

	/* If no task restrictions exist, set up a new set */
	res = current->io_uring_restrict;
	if (!res) {
		res = kzalloc(sizeof(*res), GFP_KERNEL_ACCOUNT);
		if (!res)
			return -ENOMEM;
	}

	ret = io_register_bpf_filter(res, arg);
	if (ret) {
		if (res != current->io_uring_restrict)
			kfree(res);
		return ret;
	}
	if (!current->io_uring_restrict)
		current->io_uring_restrict = res;
	return 0;
}

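/*
 * Enable a ring that was created with IORING_SETUP_R_DISABLED. For
 * IORING_SETUP_SINGLE_ISSUER the enabling task becomes the submitter, and
 * an SQPOLL thread waiting for the ring to be enabled is woken up.
 */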
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER) {
		ctx->submitter_task = get_task_struct(current);
		/*
		 * Lazy activation attempts would fail if the ring was polled
		 * before submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	/* Keep submitter_task store before clearing IORING_SETUP_R_DISABLED */
	smp_store_release(&ctx->flags, ctx->flags & ~IORING_SETUP_R_DISABLED);
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

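/*
 * Copy a CPU affinity mask from userspace (compat aware) and apply it to
 * the io-wq of the current task, or to the SQPOLL thread if the ring was
 * set up with IORING_SETUP_SQPOLL.
 */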
static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

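/*
 * Update the maximum number of io-wq workers. 'arg' points to two __u32
 * values: [0] caps bounded workers, [1] caps unbounded workers; a zero
 * entry leaves that limit untouched. The previous limits are copied back
 * to userspace. For SQPOLL rings the limits apply to the SQPOLL task's
 * io-wq, otherwise they are propagated to all registered tasks.
 */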
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			struct task_struct *tsk;

			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			tsk = sqpoll_task_locked(sqd);
			if (tsk)
				tctx = tsk->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	mutex_lock(&ctx->tctx_lock);
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	mutex_unlock(&ctx->tctx_lock);
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}

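/*
 * Pick the clock source the ring uses when waiting for completions; only
 * CLOCK_MONOTONIC and CLOCK_BOOTTIME are accepted.
 */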
static int io_register_clock(struct io_ring_ctx *ctx,
			     struct io_uring_clock_register __user *arg)
{
	struct io_uring_clock_register reg;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;

	switch (reg.clockid) {
	case CLOCK_MONOTONIC:
		ctx->clock_offset = 0;
		break;
	case CLOCK_BOOTTIME:
		ctx->clock_offset = TK_OFFS_BOOT;
		break;
	default:
		return -EINVAL;
	}

	ctx->clockid = reg.clockid;
	return 0;
}

/*
 * State to maintain until we can swap. Both new and old state, used for
 * either mapping or freeing.
 */
struct io_ring_ctx_rings {
	struct io_rings *rings;
	struct io_uring_sqe *sq_sqes;

	struct io_mapped_region sq_region;
	struct io_mapped_region ring_region;
};

static void io_register_free_rings(struct io_ring_ctx *ctx,
				   struct io_ring_ctx_rings *r)
{
	io_free_region(ctx->user, &r->sq_region);
	io_free_region(ctx->user, &r->ring_region);
}

#define swap_old(ctx, o, n, field)		\
	do {					\
		(o).field = (ctx)->field;	\
		(ctx)->field = (n).field;	\
	} while (0)

#define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
#define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \
			 IORING_SETUP_CQE_MIXED | IORING_SETUP_SQE_MIXED)

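/*
 * Resize the SQ/CQ rings of a live ring. New ring and SQE regions are
 * allocated first, then the swap is done under ctx->mmap_lock and
 * ctx->completion_lock: existing SQ and CQ entries are copied across, and
 * the operation fails with -EOVERFLOW if the new rings cannot hold them.
 * Currently limited to IORING_SETUP_DEFER_TASKRUN rings.
 */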
static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_ctx_config config;
	struct io_uring_region_desc rd;
	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
	unsigned i, tail, old_head;
	struct io_uring_params *p = &config.p;
	struct io_rings_layout *rl = &config.layout;
	int ret;

	memset(&config, 0, sizeof(config));

	/* limited to DEFER_TASKRUN for now */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		return -EINVAL;
	if (copy_from_user(p, arg, sizeof(*p)))
		return -EFAULT;
	if (p->flags & ~RESIZE_FLAGS)
		return -EINVAL;

	/* properties that are always inherited */
	p->flags |= (ctx->flags & COPY_FLAGS);

	ret = io_prepare_config(&config);
	if (unlikely(ret))
		return ret;

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(rl->rings_size);
	if (p->flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p->cq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
	if (ret)
		return ret;

	n.rings = io_region_get_ptr(&n.ring_region);

	/*
	 * At this point n.rings is shared with userspace, just like o.rings
	 * is as well. While we don't expect userspace to modify it while
	 * a resize is in progress, and it's most likely that userspace will
	 * shoot itself in the foot if it does, we can't always assume good
	 * intent... Use read/write once helpers from here on to indicate the
	 * shared nature of it.
	 */
	WRITE_ONCE(n.rings->sq_ring_mask, p->sq_entries - 1);
	WRITE_ONCE(n.rings->cq_ring_mask, p->cq_entries - 1);
	WRITE_ONCE(n.rings->sq_ring_entries, p->sq_entries);
	WRITE_ONCE(n.rings->cq_ring_entries, p->cq_entries);

	if (copy_to_user(arg, p, sizeof(*p))) {
		io_register_free_rings(ctx, &n);
		return -EFAULT;
	}

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(rl->sq_size);
	if (p->flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p->sq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
	if (ret) {
		io_register_free_rings(ctx, &n);
		return ret;
	}
	n.sq_sqes = io_region_get_ptr(&n.sq_region);

	/*
	 * If using SQPOLL, park the thread
	 */
	if (ctx->sq_data) {
		mutex_unlock(&ctx->uring_lock);
		io_sq_thread_park(ctx->sq_data);
		mutex_lock(&ctx->uring_lock);
	}

	/*
	 * We'll do the swap. Grab the ctx->mmap_lock, which will exclude
	 * any new mmaps on the ring fd. Clear out existing mappings to prevent
	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
	 * existing rings beyond this point will fail. Not that it could proceed
	 * at this point anyway, as the io_uring mmap side needs to grab the
	 * ctx->mmap_lock as well. Likewise, hold the completion lock over the
	 * duration of the actual swap.
	 */
	mutex_lock(&ctx->mmap_lock);
	spin_lock(&ctx->completion_lock);
	o.rings = ctx->rings;
	ctx->rings = NULL;
	o.sq_sqes = ctx->sq_sqes;
	ctx->sq_sqes = NULL;

	/*
	 * Now copy SQ and CQ entries, if any. If either of the destination
	 * rings can't hold what is already there, then fail the operation.
	 */
	tail = READ_ONCE(o.rings->sq.tail);
	old_head = READ_ONCE(o.rings->sq.head);
	if (tail - old_head > p->sq_entries)
		goto overflow;
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->sq_entries - 1);
		unsigned dst_head = i & (p->sq_entries - 1);

		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
	}
	WRITE_ONCE(n.rings->sq.head, old_head);
	WRITE_ONCE(n.rings->sq.tail, tail);

	tail = READ_ONCE(o.rings->cq.tail);
	old_head = READ_ONCE(o.rings->cq.head);
	if (tail - old_head > p->cq_entries) {
overflow:
		/* restore old rings, and return -EOVERFLOW via cleanup path */
		ctx->rings = o.rings;
		ctx->sq_sqes = o.sq_sqes;
		to_free = &n;
		ret = -EOVERFLOW;
		goto out;
	}
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->cq_entries - 1);
		unsigned dst_head = i & (p->cq_entries - 1);

		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
	}
	WRITE_ONCE(n.rings->cq.head, old_head);
	WRITE_ONCE(n.rings->cq.tail, tail);
	/* invalidate cached cqe refill */
	ctx->cqe_cached = ctx->cqe_sentinel = NULL;

	WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped));
	atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags));
	WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags));
	WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));

	/* all done, store old pointers and assign new ones */
	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
		ctx->sq_array = (u32 *)((char *)n.rings + rl->sq_array_offset);

	ctx->sq_entries = p->sq_entries;
	ctx->cq_entries = p->cq_entries;

	ctx->rings = n.rings;
	ctx->sq_sqes = n.sq_sqes;
	swap_old(ctx, o, n, ring_region);
	swap_old(ctx, o, n, sq_region);
	to_free = &o;
	ret = 0;
out:
	spin_unlock(&ctx->completion_lock);
	mutex_unlock(&ctx->mmap_lock);
	io_register_free_rings(ctx, to_free);

	if (ctx->sq_data)
		io_sq_thread_unpark(ctx->sq_data);

	return ret;
}

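/*
 * Register a user-described memory region with the ring. Only one region
 * can be registered, and if it is flagged as a wait argument region
 * (IORING_MEM_REGION_REG_WAIT_ARG) the ring must still be disabled.
 */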
static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
{
	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
	struct io_uring_mem_region_reg reg;
	struct io_uring_region_desc __user *rd_uptr;
	struct io_uring_region_desc rd;
	struct io_mapped_region region = {};
	int ret;

	if (io_region_is_set(&ctx->param_region))
		return -EBUSY;
	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
		return -EFAULT;
	rd_uptr = u64_to_user_ptr(reg.region_uptr);
	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;
	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
		return -EINVAL;

	/*
	 * This ensures there are no waiters. Waiters are unlocked and it's
	 * hard to synchronise with them, especially if we need to initialise
	 * the region.
	 */
	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
	    !(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EINVAL;

	ret = io_create_region(ctx, &region, &rd, IORING_MAP_OFF_PARAM_REGION);
	if (ret)
		return ret;
	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
		io_free_region(ctx->user, &region);
		return -EFAULT;
	}

	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
		ctx->cq_wait_arg = io_region_get_ptr(&region);
		ctx->cq_wait_size = rd.size;
	}

	io_region_publish(ctx, &region, &ctx->param_region);
	return 0;
}

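/*
 * Main registration dispatcher, called with ctx->uring_lock held. If the
 * ring has register-op restrictions and has been enabled, the opcode is
 * checked against the allowed set before dispatching.
 */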
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->reg_restricted && !(ctx->flags & IORING_SETUP_R_DISABLED)) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	case IORING_REGISTER_ZCRX_IFQ:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_zcrx_ifq(ctx, arg);
		break;
	case IORING_REGISTER_RESIZE_RINGS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_resize_rings(ctx, arg);
		break;
	case IORING_REGISTER_MEM_REGION:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_mem_region(ctx, arg);
		break;
	case IORING_REGISTER_QUERY:
		ret = io_query(arg, nr_args);
		break;
	case IORING_REGISTER_ZCRX_CTRL:
		ret = io_zcrx_ctrl(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_BPF_FILTER:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_register_bpf_filter(&ctx->restrictions, arg);
		if (!ret)
			WRITE_ONCE(ctx->bpf_filters,
				   ctx->restrictions.bpf_filters->filters);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

/*
 * Given an 'fd' value, return the ctx associated with it. If 'registered' is
 * true, then the registered index is used. Otherwise, the normal fd table.
 * Caller must call fput() on the returned file, unless it's an ERR_PTR.
 */
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
	struct file *file;

	if (registered) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return ERR_PTR(-EINVAL);
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (file)
			get_file(file);
	} else {
		file = fget(fd);
	}

	if (unlikely(!file))
		return ERR_PTR(-EBADF);
	if (io_is_uring_fops(file))
		return file;
	fput(file);
	return ERR_PTR(-EOPNOTSUPP);
}

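/*
 * Synchronously issue a single IORING_OP_MSG_RING SQE without going
 * through a local ring: the SQE is copied from userspace and must carry
 * the MSG_RING opcode with no flags set.
 */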
static int io_uring_register_send_msg_ring(void __user *arg, unsigned int nr_args)
{
	struct io_uring_sqe sqe;

	if (!arg || nr_args != 1)
		return -EINVAL;
	if (copy_from_user(&sqe, arg, sizeof(sqe)))
		return -EFAULT;
	/* no flags supported */
	if (sqe.flags)
		return -EINVAL;
	if (sqe.opcode != IORING_OP_MSG_RING)
		return -EINVAL;

	return io_uring_sync_msg_ring(&sqe);
}

/*
 * "blind" registration opcodes are ones where there's no ring given, and
 * hence the source fd must be -1.
 */
static int io_uring_register_blind(unsigned int opcode, void __user *arg,
				   unsigned int nr_args)
{
	switch (opcode) {
	case IORING_REGISTER_SEND_MSG_RING:
		return io_uring_register_send_msg_ring(arg, nr_args);
	case IORING_REGISTER_QUERY:
		return io_query(arg, nr_args);
	case IORING_REGISTER_RESTRICTIONS:
		return io_register_restrictions_task(arg, nr_args);
	case IORING_REGISTER_BPF_FILTER:
		return io_register_bpf_filter_task(arg, nr_args);
	}
	return -EINVAL;
}

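/*
 * Syscall entry point. 'fd' identifies the ring (or is -1 for the "blind"
 * opcodes above), and the opcode may be OR'ed with
 * IORING_REGISTER_USE_REGISTERED_RING to have 'fd' interpreted as a
 * registered ring index rather than a regular file descriptor.
 *
 * Illustrative userspace call (raw syscall, no liburing helpers):
 *
 *	struct iovec iov = { .iov_base = buf, .iov_len = len };
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_BUFFERS,
 *		&iov, 1);
 */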
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (fd == -1)
		return io_uring_register_blind(opcode, arg, nr_args);

	file = io_uring_register_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);

	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
				ctx->buf_table.nr, ret);
	mutex_unlock(&ctx->uring_lock);

	fput(file);
	return ret;
}
1033