xref: /linux/io_uring/register.c (revision fa58e6e9000c1cc76a7a0c06ea3e68d728cc4247)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Code related to the io_uring_register() syscall
4  *
5  * Copyright (C) 2023 Jens Axboe
6  */
7 #include <linux/kernel.h>
8 #include <linux/errno.h>
9 #include <linux/syscalls.h>
10 #include <linux/refcount.h>
11 #include <linux/bits.h>
12 #include <linux/fs.h>
13 #include <linux/file.h>
14 #include <linux/slab.h>
15 #include <linux/uaccess.h>
16 #include <linux/nospec.h>
17 #include <linux/compat.h>
18 #include <linux/io_uring.h>
19 #include <linux/io_uring_types.h>
20 
21 #include "filetable.h"
22 #include "io_uring.h"
23 #include "opdef.h"
24 #include "tctx.h"
25 #include "rsrc.h"
26 #include "sqpoll.h"
27 #include "register.h"
28 #include "cancel.h"
29 #include "kbuf.h"
30 #include "napi.h"
31 #include "eventfd.h"
32 #include "msg_ring.h"
33 #include "memmap.h"
34 #include "zcrx.h"
35 #include "query.h"
36 #include "bpf_filter.h"
37 
38 #define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
39 				 IORING_REGISTER_LAST + IORING_OP_LAST)
40 
io_probe(struct io_ring_ctx * ctx,void __user * arg,unsigned nr_args)41 static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
42 			   unsigned nr_args)
43 {
44 	struct io_uring_probe *p;
45 	size_t size;
46 	int i, ret;
47 
48 	if (nr_args > IORING_OP_LAST)
49 		nr_args = IORING_OP_LAST;
50 
51 	size = struct_size(p, ops, nr_args);
52 	p = memdup_user(arg, size);
53 	if (IS_ERR(p))
54 		return PTR_ERR(p);
55 	ret = -EINVAL;
56 	if (memchr_inv(p, 0, size))
57 		goto out;
58 
59 	p->last_op = IORING_OP_LAST - 1;
60 
61 	for (i = 0; i < nr_args; i++) {
62 		p->ops[i].op = i;
63 		if (io_uring_op_supported(i))
64 			p->ops[i].flags = IO_URING_OP_SUPPORTED;
65 	}
66 	p->ops_len = i;
67 
68 	ret = 0;
69 	if (copy_to_user(arg, p, size))
70 		ret = -EFAULT;
71 out:
72 	kfree(p);
73 	return ret;
74 }
75 
io_unregister_personality(struct io_ring_ctx * ctx,unsigned id)76 int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
77 {
78 	const struct cred *creds;
79 
80 	creds = xa_erase(&ctx->personalities, id);
81 	if (creds) {
82 		put_cred(creds);
83 		return 0;
84 	}
85 
86 	return -EINVAL;
87 }
88 
89 
io_register_personality(struct io_ring_ctx * ctx)90 static int io_register_personality(struct io_ring_ctx *ctx)
91 {
92 	const struct cred *creds;
93 	u32 id;
94 	int ret;
95 
96 	creds = get_current_cred();
97 
98 	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
99 			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
100 	if (ret < 0) {
101 		put_cred(creds);
102 		return ret;
103 	}
104 	return id;
105 }
106 
107 /*
108  * Returns number of restrictions parsed and added on success, or < 0 for
109  * an error.
110  */
static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
					struct io_restriction *restrictions)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	/* array_size() saturates to SIZE_MAX on multiplication overflow */
	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = -EINVAL;

	/*
	 * Each entry either allows a register opcode, allows an SQE opcode,
	 * or constrains SQE flags. Any unknown or out-of-range entry fails
	 * the whole parse.
	 */
	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST)
				goto err;
			__set_bit(res[i].register_op, restrictions->register_op);
			restrictions->reg_registered = true;
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST)
				goto err;
			__set_bit(res[i].sqe_op, restrictions->sqe_op);
			restrictions->op_registered = true;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			restrictions->sqe_flags_allowed = res[i].sqe_flags;
			restrictions->op_registered = true;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			restrictions->sqe_flags_required = res[i].sqe_flags;
			restrictions->op_registered = true;
			break;
		default:
			goto err;
		}
	}
	ret = nr_args;
	/*
	 * An empty restriction set still marks both classes as registered:
	 * with no bits set, everything is denied.
	 */
	if (!nr_args) {
		restrictions->op_registered = true;
		restrictions->reg_registered = true;
	}
err:
	kfree(res);
	return ret;
}
166 
/*
 * IORING_REGISTER_RESTRICTIONS: install per-ring restrictions. Only valid
 * while the ring is still disabled (IORING_SETUP_R_DISABLED), and only one
 * registration is allowed per ring.
 */
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	int ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.op_registered || ctx->restrictions.reg_registered)
		return -EBUSY;

	ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
	/*
	 * Reset all restrictions if an error happened, but retain any COW'ed
	 * settings.
	 */
	if (ret < 0) {
		/* preserve BPF filter state across the wipe */
		struct io_bpf_filters *bpf = ctx->restrictions.bpf_filters;
		bool cowed = ctx->restrictions.bpf_filters_cow;

		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
		ctx->restrictions.bpf_filters = bpf;
		ctx->restrictions.bpf_filters_cow = cowed;
		return ret;
	}
	/* flag which classes of restrictions are now active on this ring */
	if (ctx->restrictions.op_registered)
		ctx->int_flags |= IO_RING_F_OP_RESTRICTED;
	if (ctx->restrictions.reg_registered)
		ctx->int_flags |= IO_RING_F_REG_RESTRICTED;
	return 0;
}
200 
/*
 * Blind (fd == -1) IORING_REGISTER_RESTRICTIONS: install restrictions on
 * the calling task rather than on a specific ring. One-shot per task.
 */
static int io_register_restrictions_task(void __user *arg, unsigned int nr_args)
{
	struct io_uring_task_restriction __user *ures = arg;
	struct io_uring_task_restriction tres;
	struct io_restriction *res;
	int ret;

	/* Disallow if task already has registered restrictions */
	if (current->io_uring_restrict)
		return -EPERM;
	/*
	 * Similar to seccomp, disallow setting a filter if task_no_new_privs
	 * is false and we're not CAP_SYS_ADMIN.
	 */
	if (!task_no_new_privs(current) &&
	    !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
		return -EACCES;
	/* exactly one io_uring_task_restriction struct is expected */
	if (nr_args != 1)
		return -EINVAL;

	if (copy_from_user(&tres, arg, sizeof(tres)))
		return -EFAULT;

	/* no flags defined yet; reserved space must be zero */
	if (tres.flags)
		return -EINVAL;
	if (!mem_is_zero(tres.resv, sizeof(tres.resv)))
		return -EINVAL;

	res = kzalloc_obj(*res, GFP_KERNEL_ACCOUNT);
	if (!res)
		return -ENOMEM;

	ret = io_parse_restrictions(ures->restrictions, tres.nr_res, res);
	if (ret < 0) {
		kfree(res);
		return ret;
	}
	/* publish the restriction set; freed when the task exits */
	current->io_uring_restrict = res;
	return 0;
}
241 
/*
 * Blind (fd == -1) IORING_REGISTER_BPF_FILTER: attach a BPF filter to the
 * calling task's restriction set, allocating that set if it doesn't exist.
 */
static int io_register_bpf_filter_task(void __user *arg, unsigned int nr_args)
{
	struct io_restriction *res;
	int ret;

	/*
	 * Similar to seccomp, disallow setting a filter if task_no_new_privs
	 * is false and we're not CAP_SYS_ADMIN.
	 */
	if (!task_no_new_privs(current) &&
	    !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
		return -EACCES;

	if (nr_args != 1)
		return -EINVAL;

	/* If no task restrictions exist, setup a new set */
	res = current->io_uring_restrict;
	if (!res) {
		res = kzalloc_obj(*res, GFP_KERNEL_ACCOUNT);
		if (!res)
			return -ENOMEM;
	}

	ret = io_register_bpf_filter(res, arg);
	if (ret) {
		/* only free @res if we allocated it above */
		if (res != current->io_uring_restrict)
			kfree(res);
		return ret;
	}
	/* publish a freshly allocated set; pre-existing ones stay as-is */
	if (!current->io_uring_restrict)
		current->io_uring_restrict = res;
	return 0;
}
276 
/*
 * IORING_REGISTER_ENABLE_RINGS: enable a ring that was created with
 * IORING_SETUP_R_DISABLED, binding the submitter task if SINGLE_ISSUER.
 */
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER) {
		/* the enabling task becomes the only allowed submitter */
		ctx->submitter_task = get_task_struct(current);
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	/* Keep submitter_task store before clearing IORING_SETUP_R_DISABLED */
	smp_store_release(&ctx->flags, ctx->flags & ~IORING_SETUP_R_DISABLED);
	/* kick an SQPOLL thread that may be waiting for the ring to enable */
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}
298 
/*
 * Apply @new_mask (or the default affinity if NULL) to the io-wq workers.
 * For SQPOLL rings the update goes through the sqpoll thread, which needs
 * uring_lock dropped for the duration.
 */
static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		/* sqpoll path takes its own locks; can't hold uring_lock */
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}
314 
io_register_iowq_aff(struct io_ring_ctx * ctx,void __user * arg,unsigned len)315 static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
316 				       void __user *arg, unsigned len)
317 {
318 	cpumask_var_t new_mask;
319 	int ret;
320 
321 	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
322 		return -ENOMEM;
323 
324 	cpumask_clear(new_mask);
325 	if (len > cpumask_size())
326 		len = cpumask_size();
327 
328 #ifdef CONFIG_COMPAT
329 	if (in_compat_syscall())
330 		ret = compat_get_bitmap(cpumask_bits(new_mask),
331 					(const compat_ulong_t __user *)arg,
332 					len * 8 /* CHAR_BIT */);
333 	else
334 #endif
335 		ret = copy_from_user(new_mask, arg, len);
336 
337 	if (ret) {
338 		free_cpumask_var(new_mask);
339 		return -EFAULT;
340 	}
341 
342 	ret = __io_register_iowq_aff(ctx, new_mask);
343 	free_cpumask_var(new_mask);
344 	return ret;
345 }
346 
/* IORING_UNREGISTER_IOWQ_AFF: a NULL mask restores default worker affinity */
static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}
351 
/*
 * IORING_REGISTER_IOWQ_MAX_WORKERS: set per-type (bounded/unbounded) io-wq
 * worker limits. @arg holds two __u32 counts; zero means "leave unchanged".
 * On return the previous limits are copied back to userspace.
 */
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			struct task_struct *tsk;

			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			tsk = sqpoll_task_locked(sqd);
			if (tsk)
				tctx = tsk->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	/* record the limits on the ctx so new tctx users inherit them */
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->int_flags |= IO_RING_F_IOWQ_LIMITS_SET;

	if (tctx && tctx->io_wq) {
		/* io_wq_max_workers() swaps in new limits, returns old ones */
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	mutex_lock(&ctx->tctx_lock);
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	mutex_unlock(&ctx->tctx_lock);
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}
442 
/*
 * IORING_REGISTER_CLOCK: select the clock used for CQ wait timeouts.
 * Only CLOCK_MONOTONIC and CLOCK_BOOTTIME are accepted.
 */
static int io_register_clock(struct io_ring_ctx *ctx,
			     struct io_uring_clock_register __user *arg)
{
	struct io_uring_clock_register reg;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	/* reserved fields must be zero */
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;

	if (reg.clockid == CLOCK_MONOTONIC)
		ctx->clock_offset = 0;
	else if (reg.clockid == CLOCK_BOOTTIME)
		ctx->clock_offset = TK_OFFS_BOOT;
	else
		return -EINVAL;

	ctx->clockid = reg.clockid;
	return 0;
}
467 
468 /*
469  * State to maintain until we can swap. Both new and old state, used for
470  * either mapping or freeing.
471  */
struct io_ring_ctx_rings {
	struct io_rings *rings;		/* mapped SQ/CQ ring headers */
	struct io_uring_sqe *sq_sqes;	/* mapped SQE array */

	struct io_mapped_region sq_region;	/* region backing sq_sqes */
	struct io_mapped_region ring_region;	/* region backing rings */
};
479 
/* Release both the SQE and ring regions held in @r */
static void io_register_free_rings(struct io_ring_ctx *ctx,
				   struct io_ring_ctx_rings *r)
{
	io_free_region(ctx->user, &r->sq_region);
	io_free_region(ctx->user, &r->ring_region);
}
486 
487 #define swap_old(ctx, o, n, field)		\
488 	do {					\
489 		(o).field = (ctx)->field;	\
490 		(ctx)->field = (n).field;	\
491 	} while (0)
492 
493 #define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
494 #define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
495 			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \
496 			 IORING_SETUP_CQE_MIXED | IORING_SETUP_SQE_MIXED)
497 
/*
 * IORING_REGISTER_RESIZE_RINGS: allocate new SQ/CQ rings per the given
 * io_uring_params, copy over any pending SQEs/CQEs, and atomically swap
 * them in under mmap_lock + completion_lock. Only supported for
 * DEFER_TASKRUN rings. Fails with -EOVERFLOW if the existing entries
 * don't fit in the new rings.
 */
static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_ctx_config config;
	struct io_uring_region_desc rd;
	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
	unsigned i, tail, old_head;
	struct io_uring_params *p = &config.p;
	struct io_rings_layout *rl = &config.layout;
	int ret;

	memset(&config, 0, sizeof(config));

	/* limited to DEFER_TASKRUN for now */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		return -EINVAL;
	if (copy_from_user(p, arg, sizeof(*p)))
		return -EFAULT;
	if (p->flags & ~RESIZE_FLAGS)
		return -EINVAL;

	/* properties that are always inherited */
	p->flags |= (ctx->flags & COPY_FLAGS);

	ret = io_prepare_config(&config);
	if (unlikely(ret))
		return ret;

	/* allocate the new CQ ring region */
	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(rl->rings_size);
	if (p->flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p->cq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
	if (ret)
		return ret;

	n.rings = io_region_get_ptr(&n.ring_region);

	/*
	 * At this point n.rings is shared with userspace, just like o.rings
	 * is as well. While we don't expect userspace to modify it while
	 * a resize is in progress, and it's most likely that userspace will
	 * shoot itself in the foot if it does, we can't always assume good
	 * intent... Use read/write once helpers from here on to indicate the
	 * shared nature of it.
	 */
	WRITE_ONCE(n.rings->sq_ring_mask, p->sq_entries - 1);
	WRITE_ONCE(n.rings->cq_ring_mask, p->cq_entries - 1);
	WRITE_ONCE(n.rings->sq_ring_entries, p->sq_entries);
	WRITE_ONCE(n.rings->cq_ring_entries, p->cq_entries);

	if (copy_to_user(arg, p, sizeof(*p))) {
		io_register_free_rings(ctx, &n);
		return -EFAULT;
	}

	/* allocate the new SQE array region */
	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(rl->sq_size);
	if (p->flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p->sq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
	if (ret) {
		io_register_free_rings(ctx, &n);
		return ret;
	}
	n.sq_sqes = io_region_get_ptr(&n.sq_region);

	/*
	 * If using SQPOLL, park the thread
	 */
	if (ctx->sq_data) {
		mutex_unlock(&ctx->uring_lock);
		io_sq_thread_park(ctx->sq_data);
		mutex_lock(&ctx->uring_lock);
	}

	/*
	 * We'll do the swap. Grab the ctx->mmap_lock, which will exclude
	 * any new mmap's on the ring fd. Clear out existing mappings to prevent
	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
	 * existing rings beyond this point will fail. Not that it could proceed
	 * at this point anyway, as the io_uring mmap side needs go grab the
	 * ctx->mmap_lock as well. Likewise, hold the completion lock over the
	 * duration of the actual swap.
	 */
	mutex_lock(&ctx->mmap_lock);
	spin_lock(&ctx->completion_lock);
	o.rings = ctx->rings;
	ctx->rings = NULL;
	o.sq_sqes = ctx->sq_sqes;
	ctx->sq_sqes = NULL;

	/*
	 * Now copy SQ and CQ entries, if any. If either of the destination
	 * rings can't hold what is already there, then fail the operation.
	 */
	tail = READ_ONCE(o.rings->sq.tail);
	old_head = READ_ONCE(o.rings->sq.head);
	if (tail - old_head > p->sq_entries)
		goto overflow;
	for (i = old_head; i < tail; i++) {
		unsigned index, dst_mask, src_mask;
		size_t sq_size;

		index = i;
		sq_size = sizeof(struct io_uring_sqe);
		src_mask = ctx->sq_entries - 1;
		dst_mask = p->sq_entries - 1;
		/* SQE128 rings use two slots (and double the mask) per SQE */
		if (ctx->flags & IORING_SETUP_SQE128) {
			index <<= 1;
			sq_size <<= 1;
			src_mask = (ctx->sq_entries << 1) - 1;
			dst_mask = (p->sq_entries << 1) - 1;
		}
		memcpy(&n.sq_sqes[index & dst_mask], &o.sq_sqes[index & src_mask], sq_size);
	}
	WRITE_ONCE(n.rings->sq.head, old_head);
	WRITE_ONCE(n.rings->sq.tail, tail);

	tail = READ_ONCE(o.rings->cq.tail);
	old_head = READ_ONCE(o.rings->cq.head);
	if (tail - old_head > p->cq_entries) {
overflow:
		/* restore old rings, and return -EOVERFLOW via cleanup path */
		ctx->rings = o.rings;
		ctx->sq_sqes = o.sq_sqes;
		to_free = &n;
		ret = -EOVERFLOW;
		goto out;
	}
	for (i = old_head; i < tail; i++) {
		unsigned index, dst_mask, src_mask;
		size_t cq_size;

		index = i;
		cq_size = sizeof(struct io_uring_cqe);
		src_mask = ctx->cq_entries - 1;
		dst_mask = p->cq_entries - 1;
		/* CQE32 rings use two slots (and double the mask) per CQE */
		if (ctx->flags & IORING_SETUP_CQE32) {
			index <<= 1;
			cq_size <<= 1;
			src_mask = (ctx->cq_entries << 1) - 1;
			dst_mask = (p->cq_entries << 1) - 1;
		}
		memcpy(&n.rings->cqes[index & dst_mask], &o.rings->cqes[index & src_mask], cq_size);
	}
	WRITE_ONCE(n.rings->cq.head, old_head);
	WRITE_ONCE(n.rings->cq.tail, tail);
	/* invalidate cached cqe refill */
	ctx->cqe_cached = ctx->cqe_sentinel = NULL;

	WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped));
	atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags));
	WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags));
	WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));

	/* all done, store old pointers and assign new ones */
	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
		ctx->sq_array = (u32 *)((char *)n.rings + rl->sq_array_offset);

	ctx->sq_entries = p->sq_entries;
	ctx->cq_entries = p->cq_entries;

	/*
	 * Just mark any flag we may have missed and that the application
	 * should act on unconditionally. Worst case it'll be an extra
	 * syscall.
	 */
	atomic_or(IORING_SQ_TASKRUN | IORING_SQ_NEED_WAKEUP, &n.rings->sq_flags);
	ctx->rings = n.rings;
	rcu_assign_pointer(ctx->rings_rcu, n.rings);

	ctx->sq_sqes = n.sq_sqes;
	swap_old(ctx, o, n, ring_region);
	swap_old(ctx, o, n, sq_region);
	to_free = &o;
	ret = 0;
out:
	spin_unlock(&ctx->completion_lock);
	mutex_unlock(&ctx->mmap_lock);
	/* Wait for concurrent io_ctx_mark_taskrun() */
	if (to_free == &o)
		synchronize_rcu_expedited();
	io_register_free_rings(ctx, to_free);

	if (ctx->sq_data)
		io_sq_thread_unpark(ctx->sq_data);

	return ret;
}
691 
/*
 * IORING_REGISTER_MEM_REGION: register a user-provided memory region with
 * the ring, optionally to be used as the CQ wait argument area. Only one
 * region may be registered per ring.
 */
static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
{
	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
	struct io_uring_mem_region_reg reg;
	struct io_uring_region_desc __user *rd_uptr;
	struct io_uring_region_desc rd;
	struct io_mapped_region region = {};
	int ret;

	if (io_region_is_set(&ctx->param_region))
		return -EBUSY;
	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
		return -EFAULT;
	rd_uptr = u64_to_user_ptr(reg.region_uptr);
	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
		return -EFAULT;
	/* reserved space must be zero; only the WAIT_ARG flag is known */
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;
	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
		return -EINVAL;

	/*
	 * This ensures there are no waiters. Waiters are unlocked and it's
	 * hard to synchronise with them, especially if we need to initialise
	 * the region.
	 */
	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
	    !(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EINVAL;

	ret = io_create_region(ctx, &region, &rd, IORING_MAP_OFF_PARAM_REGION);
	if (ret)
		return ret;
	/* report the final region description back to userspace */
	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
		io_free_region(ctx->user, &region);
		return -EFAULT;
	}

	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
		ctx->cq_wait_arg = io_region_get_ptr(&region);
		ctx->cq_wait_size = rd.size;
	}

	/* make the region visible for mmap/lookup */
	io_region_publish(ctx, &region, &ctx->param_region);
	return 0;
}
738 
/*
 * Dispatch a single io_uring_register() opcode against @ctx. Called with
 * uring_lock held; some handlers temporarily drop it (hence the sparse
 * release/acquire annotations). Per-opcode arg/nr_args validation happens
 * here before calling the handler.
 */
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	/* SINGLE_ISSUER rings only accept register calls from the submitter */
	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	/* enforce registered restrictions once the ring has been enabled */
	if ((ctx->int_flags & IO_RING_F_REG_RESTRICTED) && !(ctx->flags & IORING_SETUP_R_DISABLED)) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		/* nr_args carries the personality id here */
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	case IORING_REGISTER_ZCRX_IFQ:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_zcrx(ctx, arg);
		break;
	case IORING_REGISTER_RESIZE_RINGS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_resize_rings(ctx, arg);
		break;
	case IORING_REGISTER_MEM_REGION:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_mem_region(ctx, arg);
		break;
	case IORING_REGISTER_QUERY:
		ret = io_query(arg, nr_args);
		break;
	case IORING_REGISTER_ZCRX_CTRL:
		ret = io_zcrx_ctrl(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_BPF_FILTER:
		ret = -EINVAL;

		if (nr_args != 1)
			break;
		ret = io_register_bpf_filter(&ctx->restrictions, arg);
		/* expose the filter list for the submission fast path */
		if (!ret)
			WRITE_ONCE(ctx->bpf_filters,
				   ctx->restrictions.bpf_filters->filters);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
968 
/*
 * Blind (fd == -1) IORING_REGISTER_SEND_MSG_RING: synchronously execute a
 * single MSG_RING SQE copied from userspace. No SQE flags are allowed.
 */
static int io_uring_register_send_msg_ring(void __user *arg, unsigned int nr_args)
{
	struct io_uring_sqe sqe;

	if (nr_args != 1 || !arg)
		return -EINVAL;
	if (copy_from_user(&sqe, arg, sizeof(sqe)))
		return -EFAULT;
	/* only a bare IORING_OP_MSG_RING sqe is accepted */
	if (sqe.opcode != IORING_OP_MSG_RING || sqe.flags)
		return -EINVAL;

	return io_uring_sync_msg_ring(&sqe);
}
985 
986 /*
987  * "blind" registration opcodes are ones where there's no ring given, and
988  * hence the source fd must be -1.
989  */
static int io_uring_register_blind(unsigned int opcode, void __user *arg,
				   unsigned int nr_args)
{
	/* only a small set of opcodes make sense without a ring */
	if (opcode == IORING_REGISTER_SEND_MSG_RING)
		return io_uring_register_send_msg_ring(arg, nr_args);
	if (opcode == IORING_REGISTER_QUERY)
		return io_query(arg, nr_args);
	if (opcode == IORING_REGISTER_RESTRICTIONS)
		return io_register_restrictions_task(arg, nr_args);
	if (opcode == IORING_REGISTER_BPF_FILTER)
		return io_register_bpf_filter_task(arg, nr_args);

	return -EINVAL;
}
1005 
/*
 * io_uring_register(2) entry point. fd == -1 selects the "blind" opcodes
 * that need no ring; otherwise resolve the ring file (normal or registered
 * fd) and dispatch under uring_lock.
 */
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	/* the high bit of opcode selects registered-ring fd lookup */
	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (fd == -1)
		return io_uring_register_blind(opcode, arg, nr_args);

	file = io_uring_ctx_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);

	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
				ctx->buf_table.nr, ret);
	mutex_unlock(&ctx->uring_lock);

	/* registered-ring fds don't hold an extra file reference */
	if (!use_registered_ring)
		fput(file);
	return ret;
}
1039