// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "filetable.h"
#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"
#include "msg_ring.h"
#include "memmap.h"
#include "zcrx.h"
#include "query.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

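/*
 * Fill in an io_uring_probe for userspace: mark each opcode this kernel
 * supports with IO_URING_OP_SUPPORTED. At most IORING_OP_LAST entries are
 * reported.
 */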
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	size = struct_size(p, ops, nr_args);
	p = memdup_user(arg, size);
	if (IS_ERR(p))
		return PTR_ERR(p);
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (io_uring_op_supported(i))
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}

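/*
 * Stash a reference to the current task's credentials in the ring and return
 * an id that SQEs can later select via sqe->personality.
 */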
static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

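/*
 * Translate an array of io_uring_restriction entries from userspace into the
 * bitmaps and flag masks of struct io_restriction.
 */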
static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
					struct io_restriction *restrictions)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = -EINVAL;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST)
				goto err;
			__set_bit(res[i].register_op, restrictions->register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST)
				goto err;
			__set_bit(res[i].sqe_op, restrictions->sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			restrictions->sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			restrictions->sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			goto err;
		}
	}

	ret = 0;

err:
	kfree(res);
	return ret;
}

static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	int ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;
	return ret;
}

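/*
 * Enable a ring created with IORING_SETUP_R_DISABLED: latch the submitter
 * task for single-issuer rings, apply any registered restrictions, and wake
 * the SQPOLL thread if one is waiting.
 */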
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

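/* Apply a CPU affinity mask to either the task's io-wq or the SQPOLL thread. */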
static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

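/* Copy an affinity mask from userspace (compat aware) and apply it. */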
static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

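/*
 * Set the maximum number of bounded/unbounded io-wq workers for this ring
 * and hand the previous limits back to userspace. For SQPOLL rings the
 * limits are applied to the SQPOLL task's io-wq instead.
 */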
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			struct task_struct *tsk;

			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			tsk = sqpoll_task_locked(sqd);
			if (tsk)
				tctx = tsk->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}

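/*
 * Select the clock (CLOCK_MONOTONIC or CLOCK_BOOTTIME) this ring uses for
 * CQ wait timeouts.
 */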
static int io_register_clock(struct io_ring_ctx *ctx,
			     struct io_uring_clock_register __user *arg)
{
	struct io_uring_clock_register reg;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;

	switch (reg.clockid) {
	case CLOCK_MONOTONIC:
		ctx->clock_offset = 0;
		break;
	case CLOCK_BOOTTIME:
		ctx->clock_offset = TK_OFFS_BOOT;
		break;
	default:
		return -EINVAL;
	}

	ctx->clockid = reg.clockid;
	return 0;
}

/*
 * State to maintain until we can swap. Both new and old state, used for
 * either mapping or freeing.
 */
struct io_ring_ctx_rings {
	struct io_rings *rings;
	struct io_uring_sqe *sq_sqes;

	struct io_mapped_region sq_region;
	struct io_mapped_region ring_region;
};

static void io_register_free_rings(struct io_ring_ctx *ctx,
				   struct io_uring_params *p,
				   struct io_ring_ctx_rings *r)
{
	io_free_region(ctx, &r->sq_region);
	io_free_region(ctx, &r->ring_region);
}

#define swap_old(ctx, o, n, field)		\
	do {					\
		(o).field = (ctx)->field;	\
		(ctx)->field = (n).field;	\
	} while (0)

#define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
#define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \
			 IORING_SETUP_CQE_MIXED)

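/*
 * Resize the SQ/CQ rings in place: allocate new ring and SQE regions, copy
 * any pending entries across, then swap the new regions in under
 * ->mmap_lock and ->completion_lock.
 */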
static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_region_desc rd;
	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
	size_t size, sq_array_offset;
	unsigned i, tail, old_head;
	struct io_uring_params p;
	int ret;

	/* limited to DEFER_TASKRUN for now */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		return -EINVAL;
	if (copy_from_user(&p, arg, sizeof(p)))
		return -EFAULT;
	if (p.flags & ~RESIZE_FLAGS)
		return -EINVAL;

	/* properties that are always inherited */
	p.flags |= (ctx->flags & COPY_FLAGS);

	ret = io_uring_fill_params(p.sq_entries, &p);
	if (unlikely(ret))
		return ret;

	size = rings_size(p.flags, p.sq_entries, p.cq_entries,
			  &sq_array_offset);
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(size);
	if (p.flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p.cq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
	if (ret) {
		io_register_free_rings(ctx, &p, &n);
		return ret;
	}
	n.rings = io_region_get_ptr(&n.ring_region);

	/*
	 * At this point n.rings is shared with userspace, just like o.rings
	 * is as well. While we don't expect userspace to modify it while
	 * a resize is in progress, and it's most likely that userspace will
	 * shoot itself in the foot if it does, we can't always assume good
	 * intent... Use read/write once helpers from here on to indicate the
	 * shared nature of it.
	 */
	WRITE_ONCE(n.rings->sq_ring_mask, p.sq_entries - 1);
	WRITE_ONCE(n.rings->cq_ring_mask, p.cq_entries - 1);
	WRITE_ONCE(n.rings->sq_ring_entries, p.sq_entries);
	WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries);

	if (copy_to_user(arg, &p, sizeof(p))) {
		io_register_free_rings(ctx, &p, &n);
		return -EFAULT;
	}

	if (p.flags & IORING_SETUP_SQE128)
		size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries);
	else
		size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
	if (size == SIZE_MAX) {
		io_register_free_rings(ctx, &p, &n);
		return -EOVERFLOW;
	}

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(size);
	if (p.flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p.sq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region_mmap_safe(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
	if (ret) {
		io_register_free_rings(ctx, &p, &n);
		return ret;
	}
	n.sq_sqes = io_region_get_ptr(&n.sq_region);

	/*
	 * If using SQPOLL, park the thread
	 */
	if (ctx->sq_data) {
		mutex_unlock(&ctx->uring_lock);
		io_sq_thread_park(ctx->sq_data);
		mutex_lock(&ctx->uring_lock);
	}

	/*
	 * We'll do the swap. Grab the ctx->mmap_lock, which will exclude
	 * any new mmap's on the ring fd. Clear out existing mappings to prevent
	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
	 * existing rings beyond this point will fail. Not that it could proceed
	 * at this point anyway, as the io_uring mmap side needs to grab the
	 * ctx->mmap_lock as well. Likewise, hold the completion lock over the
	 * duration of the actual swap.
	 */
	mutex_lock(&ctx->mmap_lock);
	spin_lock(&ctx->completion_lock);
	o.rings = ctx->rings;
	ctx->rings = NULL;
	o.sq_sqes = ctx->sq_sqes;
	ctx->sq_sqes = NULL;

	/*
	 * Now copy SQ and CQ entries, if any. If either of the destination
	 * rings can't hold what is already there, then fail the operation.
	 */
	tail = READ_ONCE(o.rings->sq.tail);
	old_head = READ_ONCE(o.rings->sq.head);
	if (tail - old_head > p.sq_entries)
		goto overflow;
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->sq_entries - 1);
		unsigned dst_head = i & (p.sq_entries - 1);

		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
	}
	WRITE_ONCE(n.rings->sq.head, old_head);
	WRITE_ONCE(n.rings->sq.tail, tail);

	tail = READ_ONCE(o.rings->cq.tail);
	old_head = READ_ONCE(o.rings->cq.head);
	if (tail - old_head > p.cq_entries) {
overflow:
		/* restore old rings, and return -EOVERFLOW via cleanup path */
		ctx->rings = o.rings;
		ctx->sq_sqes = o.sq_sqes;
		to_free = &n;
		ret = -EOVERFLOW;
		goto out;
	}
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->cq_entries - 1);
		unsigned dst_head = i & (p.cq_entries - 1);

		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
	}
	WRITE_ONCE(n.rings->cq.head, old_head);
	WRITE_ONCE(n.rings->cq.tail, tail);
	/* invalidate cached cqe refill */
	ctx->cqe_cached = ctx->cqe_sentinel = NULL;

	WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped));
	atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags));
	WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags));
	WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));

	/* all done, store old pointers and assign new ones */
	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
		ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset);

	ctx->sq_entries = p.sq_entries;
	ctx->cq_entries = p.cq_entries;

	ctx->rings = n.rings;
	ctx->sq_sqes = n.sq_sqes;
	swap_old(ctx, o, n, ring_region);
	swap_old(ctx, o, n, sq_region);
	to_free = &o;
	ret = 0;
out:
	spin_unlock(&ctx->completion_lock);
	mutex_unlock(&ctx->mmap_lock);
	io_register_free_rings(ctx, &p, to_free);

	if (ctx->sq_data)
		io_sq_thread_unpark(ctx->sq_data);

	return ret;
}

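/*
 * Register a user supplied memory region with the ring, optionally used as
 * the extended argument area for CQ waits (IORING_MEM_REGION_REG_WAIT_ARG).
 */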
static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
{
	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
	struct io_uring_mem_region_reg reg;
	struct io_uring_region_desc __user *rd_uptr;
	struct io_uring_region_desc rd;
	int ret;

	if (io_region_is_set(&ctx->param_region))
		return -EBUSY;
	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
		return -EFAULT;
	rd_uptr = u64_to_user_ptr(reg.region_uptr);
	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;
	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
		return -EINVAL;

	/*
	 * This ensures there are no waiters. Waiters are unlocked and it's
	 * hard to synchronise with them, especially if we need to initialise
	 * the region.
	 */
	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
	    !(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EINVAL;

	ret = io_create_region_mmap_safe(ctx, &ctx->param_region, &rd,
					 IORING_MAP_OFF_PARAM_REGION);
	if (ret)
		return ret;
	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
		guard(mutex)(&ctx->mmap_lock);
		io_free_region(ctx, &ctx->param_region);
		return -EFAULT;
	}

	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
		ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region);
		ctx->cq_wait_size = rd.size;
	}
	return 0;
}

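/*
 * Dispatch a single io_uring_register() opcode. Called with ->uring_lock
 * held; on restricted rings, only opcodes allowed by the registered
 * restrictions are accepted.
 */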
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	case IORING_REGISTER_ZCRX_IFQ:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_zcrx_ifq(ctx, arg);
		break;
	case IORING_REGISTER_RESIZE_RINGS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_resize_rings(ctx, arg);
		break;
	case IORING_REGISTER_MEM_REGION:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_mem_region(ctx, arg);
		break;
	case IORING_REGISTER_QUERY:
		ret = io_query(ctx, arg, nr_args);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

/*
 * Given an 'fd' value, return the ctx associated with it. If 'registered' is
 * true, then the registered index is used. Otherwise, the normal fd table.
 * Caller must call fput() on the returned file, unless it's an ERR_PTR.
 */
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
	struct file *file;

	if (registered) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return ERR_PTR(-EINVAL);
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (file)
			get_file(file);
	} else {
		file = fget(fd);
	}

	if (unlikely(!file))
		return ERR_PTR(-EBADF);
	if (io_is_uring_fops(file))
		return file;
	fput(file);
	return ERR_PTR(-EOPNOTSUPP);
}

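/*
 * Validate and synchronously issue a single IORING_OP_MSG_RING SQE supplied
 * by userspace; no source ring is required for this operation.
 */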
static int io_uring_register_send_msg_ring(void __user *arg, unsigned int nr_args)
{
	struct io_uring_sqe sqe;

	if (!arg || nr_args != 1)
		return -EINVAL;
	if (copy_from_user(&sqe, arg, sizeof(sqe)))
		return -EFAULT;
	/* no flags supported */
	if (sqe.flags)
		return -EINVAL;
	if (sqe.opcode != IORING_OP_MSG_RING)
		return -EINVAL;

	return io_uring_sync_msg_ring(&sqe);
}

/*
 * "blind" registration opcodes are ones where there's no ring given, and
 * hence the source fd must be -1.
 */
static int io_uring_register_blind(unsigned int opcode, void __user *arg,
				   unsigned int nr_args)
{
	switch (opcode) {
	case IORING_REGISTER_SEND_MSG_RING:
		return io_uring_register_send_msg_ring(arg, nr_args);
	case IORING_REGISTER_QUERY:
		return io_query(NULL, arg, nr_args);
	}
	return -EINVAL;
}

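/*
 * io_uring_register() syscall entry point: resolve the target ring (unless
 * the opcode is "blind"), take ->uring_lock, and dispatch the opcode.
 */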
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (fd == -1)
		return io_uring_register_blind(opcode, arg, nr_args);

	file = io_uring_register_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);

	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
				ctx->buf_table.nr, ret);
	mutex_unlock(&ctx->uring_lock);

	fput(file);
	return ret;
}