// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "filetable.h"
#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"
#include "msg_ring.h"
#include "memmap.h"
#include "zcrx.h"
#include "query.h"
#include "bpf_filter.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

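/*
 * Fill in the io_uring_probe structure handed in by userspace: report the
 * highest supported opcode and flag each opcode the kernel supports.
 */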
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	size = struct_size(p, ops, nr_args);
	p = memdup_user(arg, size);
	if (IS_ERR(p))
		return PTR_ERR(p);
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (io_uring_op_supported(i))
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}

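/*
 * Snapshot the current task's credentials and store them in the ring's
 * personalities xarray, returning the allocated id. The reference is
 * dropped again when the personality is unregistered or the ring is torn
 * down.
 */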
static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

/*
 * Returns number of restrictions parsed and added on success, or < 0 for
 * an error.
 */
static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
					struct io_restriction *restrictions)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = -EINVAL;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST)
				goto err;
			__set_bit(res[i].register_op, restrictions->register_op);
			restrictions->reg_registered = true;
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST)
				goto err;
			__set_bit(res[i].sqe_op, restrictions->sqe_op);
			restrictions->op_registered = true;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			restrictions->sqe_flags_allowed = res[i].sqe_flags;
			restrictions->op_registered = true;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			restrictions->sqe_flags_required = res[i].sqe_flags;
			restrictions->op_registered = true;
			break;
		default:
			goto err;
		}
	}
	ret = nr_args;
	if (!nr_args) {
		restrictions->op_registered = true;
		restrictions->reg_registered = true;
	}
err:
	kfree(res);
	return ret;
}

static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	int ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.op_registered || ctx->restrictions.reg_registered)
		return -EBUSY;

	ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
	/* Reset all restrictions if an error happened */
	if (ret < 0) {
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
		return ret;
	}
	if (ctx->restrictions.op_registered)
		ctx->op_restricted = 1;
	if (ctx->restrictions.reg_registered)
		ctx->reg_restricted = 1;
	return 0;
}

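/*
 * Register restrictions on the current task rather than on a ring, for the
 * ring-less (fd == -1) register path. A task can only do this once, and,
 * like seccomp, it requires no_new_privs or CAP_SYS_ADMIN.
 */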
static int io_register_restrictions_task(void __user *arg, unsigned int nr_args)
{
	struct io_uring_task_restriction __user *ures = arg;
	struct io_uring_task_restriction tres;
	struct io_restriction *res;
	int ret;

	/* Disallow if task already has registered restrictions */
	if (current->io_uring_restrict)
		return -EPERM;
	/*
	 * Similar to seccomp, disallow setting a filter if task_no_new_privs
	 * is false and we're not CAP_SYS_ADMIN.
	 */
	if (!task_no_new_privs(current) &&
	    !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
		return -EACCES;
	if (nr_args != 1)
		return -EINVAL;

	if (copy_from_user(&tres, arg, sizeof(tres)))
		return -EFAULT;

	if (tres.flags)
		return -EINVAL;
	if (!mem_is_zero(tres.resv, sizeof(tres.resv)))
		return -EINVAL;

	res = kzalloc_obj(*res, GFP_KERNEL_ACCOUNT);
	if (!res)
		return -ENOMEM;

	ret = io_parse_restrictions(ures->restrictions, tres.nr_res, res);
	if (ret < 0) {
		kfree(res);
		return ret;
	}
	current->io_uring_restrict = res;
	return 0;
}

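/*
 * Attach a BPF filter to the current task's restriction set, allocating
 * the set on the fly if the task doesn't have one yet. Like seccomp, this
 * requires no_new_privs or CAP_SYS_ADMIN.
 */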
static int io_register_bpf_filter_task(void __user *arg, unsigned int nr_args)
{
	struct io_restriction *res;
	int ret;

	/*
	 * Similar to seccomp, disallow setting a filter if task_no_new_privs
	 * is false and we're not CAP_SYS_ADMIN.
	 */
	if (!task_no_new_privs(current) &&
	    !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
		return -EACCES;

	if (nr_args != 1)
		return -EINVAL;

	/* If no task restrictions exist, setup a new set */
	res = current->io_uring_restrict;
	if (!res) {
		res = kzalloc_obj(*res, GFP_KERNEL_ACCOUNT);
		if (!res)
			return -ENOMEM;
	}

	ret = io_register_bpf_filter(res, arg);
	if (ret) {
		if (res != current->io_uring_restrict)
			kfree(res);
		return ret;
	}
	if (!current->io_uring_restrict)
		current->io_uring_restrict = res;
	return 0;
}

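/*
 * Enable a ring created with IORING_SETUP_R_DISABLED. For SINGLE_ISSUER
 * rings the enabling task becomes the submitter, and any SQPOLL thread
 * waiting for the ring to be enabled is woken up.
 */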
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER) {
		ctx->submitter_task = get_task_struct(current);
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	/* Keep submitter_task store before clearing IORING_SETUP_R_DISABLED */
	smp_store_release(&ctx->flags, ctx->flags & ~IORING_SETUP_R_DISABLED);
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

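/*
 * Copy an affinity mask from userspace, handling the compat bitmap layout,
 * and apply it via __io_register_iowq_aff().
 */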
static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

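/*
 * Set new bounded/unbounded io-wq worker limits and report the previous
 * ones back to userspace. For SQPOLL rings only the SQPOLL task's io-wq is
 * updated; otherwise the limits are propagated to all registered tasks.
 */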
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			struct task_struct *tsk;

			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			tsk = sqpoll_task_locked(sqd);
			if (tsk)
				tctx = tsk->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	mutex_lock(&ctx->tctx_lock);
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	mutex_unlock(&ctx->tctx_lock);
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}

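/*
 * Select the clock id used when this ring waits with a timeout:
 * CLOCK_MONOTONIC (the default) or CLOCK_BOOTTIME.
 */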
static int io_register_clock(struct io_ring_ctx *ctx,
			     struct io_uring_clock_register __user *arg)
{
	struct io_uring_clock_register reg;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;

	switch (reg.clockid) {
	case CLOCK_MONOTONIC:
		ctx->clock_offset = 0;
		break;
	case CLOCK_BOOTTIME:
		ctx->clock_offset = TK_OFFS_BOOT;
		break;
	default:
		return -EINVAL;
	}

	ctx->clockid = reg.clockid;
	return 0;
}

/*
 * State to maintain until we can swap. Both new and old state, used for
 * either mapping or freeing.
 */
struct io_ring_ctx_rings {
	struct io_rings *rings;
	struct io_uring_sqe *sq_sqes;

	struct io_mapped_region sq_region;
	struct io_mapped_region ring_region;
};

static void io_register_free_rings(struct io_ring_ctx *ctx,
				   struct io_ring_ctx_rings *r)
{
	io_free_region(ctx->user, &r->sq_region);
	io_free_region(ctx->user, &r->ring_region);
}

#define swap_old(ctx, o, n, field)		\
	do {					\
		(o).field = (ctx)->field;	\
		(ctx)->field = (n).field;	\
	} while (0)

#define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
#define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \
			 IORING_SETUP_CQE_MIXED | IORING_SETUP_SQE_MIXED)

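/*
 * Resize the SQ/CQ rings of a DEFER_TASKRUN ring. New ring and SQE regions
 * are allocated up front, pending SQEs/CQEs are copied across, and the
 * regions are swapped under ctx->mmap_lock and ctx->completion_lock so
 * neither mmap nor completion posting can observe a half-switched ring.
 */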
static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_ctx_config config;
	struct io_uring_region_desc rd;
	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
	unsigned i, tail, old_head;
	struct io_uring_params *p = &config.p;
	struct io_rings_layout *rl = &config.layout;
	int ret;

	memset(&config, 0, sizeof(config));

	/* limited to DEFER_TASKRUN for now */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		return -EINVAL;
	if (copy_from_user(p, arg, sizeof(*p)))
		return -EFAULT;
	if (p->flags & ~RESIZE_FLAGS)
		return -EINVAL;

	/* properties that are always inherited */
	p->flags |= (ctx->flags & COPY_FLAGS);

	ret = io_prepare_config(&config);
	if (unlikely(ret))
		return ret;

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(rl->rings_size);
	if (p->flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p->cq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
	if (ret)
		return ret;

	n.rings = io_region_get_ptr(&n.ring_region);

	/*
	 * At this point n.rings is shared with userspace, just like o.rings
	 * is as well. While we don't expect userspace to modify it while
	 * a resize is in progress, and it's most likely that userspace will
	 * shoot itself in the foot if it does, we can't always assume good
	 * intent... Use read/write once helpers from here on to indicate the
	 * shared nature of it.
	 */
	WRITE_ONCE(n.rings->sq_ring_mask, p->sq_entries - 1);
	WRITE_ONCE(n.rings->cq_ring_mask, p->cq_entries - 1);
	WRITE_ONCE(n.rings->sq_ring_entries, p->sq_entries);
	WRITE_ONCE(n.rings->cq_ring_entries, p->cq_entries);

	if (copy_to_user(arg, p, sizeof(*p))) {
		io_register_free_rings(ctx, &n);
		return -EFAULT;
	}

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(rl->sq_size);
	if (p->flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p->sq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
	if (ret) {
		io_register_free_rings(ctx, &n);
		return ret;
	}
	n.sq_sqes = io_region_get_ptr(&n.sq_region);

	/*
	 * If using SQPOLL, park the thread
	 */
	if (ctx->sq_data) {
		mutex_unlock(&ctx->uring_lock);
		io_sq_thread_park(ctx->sq_data);
		mutex_lock(&ctx->uring_lock);
	}

	/*
	 * We'll do the swap. Grab the ctx->mmap_lock, which will exclude
	 * any new mmap's on the ring fd. Clear out existing mappings to prevent
	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
	 * existing rings beyond this point will fail. Not that it could proceed
	 * at this point anyway, as the io_uring mmap side needs to grab the
	 * ctx->mmap_lock as well. Likewise, hold the completion lock over the
	 * duration of the actual swap.
	 */
	mutex_lock(&ctx->mmap_lock);
	spin_lock(&ctx->completion_lock);
	o.rings = ctx->rings;
	ctx->rings = NULL;
	o.sq_sqes = ctx->sq_sqes;
	ctx->sq_sqes = NULL;

	/*
	 * Now copy SQ and CQ entries, if any. If either of the destination
	 * rings can't hold what is already there, then fail the operation.
	 */
	tail = READ_ONCE(o.rings->sq.tail);
	old_head = READ_ONCE(o.rings->sq.head);
	if (tail - old_head > p->sq_entries)
		goto overflow;
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->sq_entries - 1);
		unsigned dst_head = i & (p->sq_entries - 1);

		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
	}
	WRITE_ONCE(n.rings->sq.head, old_head);
	WRITE_ONCE(n.rings->sq.tail, tail);

	tail = READ_ONCE(o.rings->cq.tail);
	old_head = READ_ONCE(o.rings->cq.head);
	if (tail - old_head > p->cq_entries) {
overflow:
		/* restore old rings, and return -EOVERFLOW via cleanup path */
		ctx->rings = o.rings;
		ctx->sq_sqes = o.sq_sqes;
		to_free = &n;
		ret = -EOVERFLOW;
		goto out;
	}
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->cq_entries - 1);
		unsigned dst_head = i & (p->cq_entries - 1);

		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
	}
	WRITE_ONCE(n.rings->cq.head, old_head);
	WRITE_ONCE(n.rings->cq.tail, tail);
	/* invalidate cached cqe refill */
	ctx->cqe_cached = ctx->cqe_sentinel = NULL;

	WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped));
	atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags));
	WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags));
	WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));

	/* all done, store old pointers and assign new ones */
	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
		ctx->sq_array = (u32 *)((char *)n.rings + rl->sq_array_offset);

	ctx->sq_entries = p->sq_entries;
	ctx->cq_entries = p->cq_entries;

	/*
	 * Just mark any flag we may have missed and that the application
	 * should act on unconditionally. Worst case it'll be an extra
	 * syscall.
	 */
	atomic_or(IORING_SQ_TASKRUN | IORING_SQ_NEED_WAKEUP, &n.rings->sq_flags);
	ctx->rings = n.rings;
	rcu_assign_pointer(ctx->rings_rcu, n.rings);

	ctx->sq_sqes = n.sq_sqes;
	swap_old(ctx, o, n, ring_region);
	swap_old(ctx, o, n, sq_region);
	to_free = &o;
	ret = 0;
out:
	spin_unlock(&ctx->completion_lock);
	mutex_unlock(&ctx->mmap_lock);
	/* Wait for concurrent io_ctx_mark_taskrun() */
	if (to_free == &o)
		synchronize_rcu_expedited();
	io_register_free_rings(ctx, to_free);

	if (ctx->sq_data)
		io_sq_thread_unpark(ctx->sq_data);

	return ret;
}

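/*
 * Register a memory region with the ring, optionally flagged for use as
 * the extended CQ wait argument area. Only one region can be registered,
 * and the wait-arg variant is only accepted while the ring is disabled.
 */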
static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
{
	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
	struct io_uring_mem_region_reg reg;
	struct io_uring_region_desc __user *rd_uptr;
	struct io_uring_region_desc rd;
	struct io_mapped_region region = {};
	int ret;

	if (io_region_is_set(&ctx->param_region))
		return -EBUSY;
	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
		return -EFAULT;
	rd_uptr = u64_to_user_ptr(reg.region_uptr);
	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;
	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
		return -EINVAL;

	/*
	 * This ensures there are no waiters. Waiters are unlocked and it's
	 * hard to synchronise with them, especially if we need to initialise
	 * the region.
	 */
	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
	    !(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EINVAL;

	ret = io_create_region(ctx, &region, &rd, IORING_MAP_OFF_PARAM_REGION);
	if (ret)
		return ret;
	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
		io_free_region(ctx->user, &region);
		return -EFAULT;
	}

	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
		ctx->cq_wait_arg = io_region_get_ptr(&region);
		ctx->cq_wait_size = rd.size;
	}

	io_region_publish(ctx, &region, &ctx->param_region);
	return 0;
}

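/*
 * Dispatch a single register opcode against a ring, called with
 * ctx->uring_lock held. Register-op restrictions are enforced here once
 * the ring has been enabled.
 */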
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->reg_restricted && !(ctx->flags & IORING_SETUP_R_DISABLED)) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	case IORING_REGISTER_ZCRX_IFQ:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_zcrx_ifq(ctx, arg);
		break;
	case IORING_REGISTER_RESIZE_RINGS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_resize_rings(ctx, arg);
		break;
	case IORING_REGISTER_MEM_REGION:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_mem_region(ctx, arg);
		break;
	case IORING_REGISTER_QUERY:
		ret = io_query(arg, nr_args);
		break;
	case IORING_REGISTER_ZCRX_CTRL:
		ret = io_zcrx_ctrl(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_BPF_FILTER:
		ret = -EINVAL;

		if (nr_args != 1)
			break;
		ret = io_register_bpf_filter(&ctx->restrictions, arg);
		if (!ret)
			WRITE_ONCE(ctx->bpf_filters,
				   ctx->restrictions.bpf_filters->filters);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

/*
 * Given an 'fd' value, return the ctx associated with it. If 'registered' is
 * true, then the registered index is used. Otherwise, the normal fd table.
 * Caller must call fput() on the returned file, unless it's an ERR_PTR.
 */
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
	struct file *file;

	if (registered) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return ERR_PTR(-EINVAL);
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (file)
			get_file(file);
	} else {
		file = fget(fd);
	}

	if (unlikely(!file))
		return ERR_PTR(-EBADF);
	if (io_is_uring_fops(file))
		return file;
	fput(file);
	return ERR_PTR(-EOPNOTSUPP);
}

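/*
 * Synchronously issue a user-supplied IORING_OP_MSG_RING SQE without a
 * source ring; only the plain opcode with no sqe flags is accepted.
 */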
static int io_uring_register_send_msg_ring(void __user *arg, unsigned int nr_args)
{
	struct io_uring_sqe sqe;

	if (!arg || nr_args != 1)
		return -EINVAL;
	if (copy_from_user(&sqe, arg, sizeof(sqe)))
		return -EFAULT;
	/* no flags supported */
	if (sqe.flags)
		return -EINVAL;
	if (sqe.opcode != IORING_OP_MSG_RING)
		return -EINVAL;

	return io_uring_sync_msg_ring(&sqe);
}

/*
 * "blind" registration opcodes are ones where there's no ring given, and
 * hence the source fd must be -1.
 */
static int io_uring_register_blind(unsigned int opcode, void __user *arg,
				   unsigned int nr_args)
{
	switch (opcode) {
	case IORING_REGISTER_SEND_MSG_RING:
		return io_uring_register_send_msg_ring(arg, nr_args);
	case IORING_REGISTER_QUERY:
		return io_query(arg, nr_args);
	case IORING_REGISTER_RESTRICTIONS:
		return io_register_restrictions_task(arg, nr_args);
	case IORING_REGISTER_BPF_FILTER:
		return io_register_bpf_filter_task(arg, nr_args);
	}
	return -EINVAL;
}

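/*
 * io_uring_register() syscall entry: strip the "use registered ring" flag
 * from the opcode, handle ring-less (fd == -1) opcodes via the blind
 * handlers, otherwise look up the ring file and dispatch with
 * ctx->uring_lock held.
 */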
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (fd == -1)
		return io_uring_register_blind(opcode, arg, nr_args);

	file = io_uring_register_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);

	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
				ctx->buf_table.nr, ret);
	mutex_unlock(&ctx->uring_lock);

	fput(file);
	return ret;
}