// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "filetable.h"
#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"
#include "msg_ring.h"
#include "memmap.h"
#include "zcrx.h"
#include "query.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

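/*
 * IORING_REGISTER_PROBE: report which SQE opcodes this kernel supports.
 * The user-supplied probe structure must come in zeroed; it is filled with
 * one entry per opcode, flagged with IO_URING_OP_SUPPORTED where the opcode
 * is implemented, and copied back to userspace.
 */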
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	size = struct_size(p, ops, nr_args);
	p = memdup_user(arg, size);
	if (IS_ERR(p))
		return PTR_ERR(p);
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (io_uring_op_supported(i))
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}


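/*
 * IORING_REGISTER_PERSONALITY: grab a reference to the caller's credentials
 * and stash it in the ctx personality xarray. The returned id can later be
 * referenced from an SQE to issue requests with those credentials.
 */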
static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			      XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

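/*
 * Parse an array of io_uring_restriction entries from userspace into the
 * in-kernel io_restriction bitmaps and sqe flag masks. Any unknown
 * restriction opcode, or an out-of-range register/sqe opcode, fails the
 * whole operation.
 */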
static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
					struct io_restriction *restrictions)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = -EINVAL;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST)
				goto err;
			__set_bit(res[i].register_op, restrictions->register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST)
				goto err;
			__set_bit(res[i].sqe_op, restrictions->sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			restrictions->sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			restrictions->sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			goto err;
		}
	}

	ret = 0;

err:
	kfree(res);
	return ret;
}

static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	int ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;
	return ret;
}

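/*
 * IORING_REGISTER_ENABLE_RINGS: enable a ring that was created with
 * IORING_SETUP_R_DISABLED. This records the submitter task for
 * single-issuer rings, arms any previously registered restrictions, clears
 * the disabled flag, and wakes an SQPOLL thread that may be waiting on it.
 */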
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

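/*
 * IORING_REGISTER_IOWQ_MAX_WORKERS: cap the number of bounded/unbounded
 * io-wq workers. For SQPOLL rings only the SQPOLL task's io-wq is updated;
 * otherwise the limits are applied to the current task's io-wq and then
 * propagated to every task registered with this ring. The resulting values
 * are copied back to the caller.
 */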
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			struct task_struct *tsk;

			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			tsk = sqpoll_task_locked(sqd);
			if (tsk)
				tctx = tsk->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}

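/*
 * IORING_REGISTER_CLOCK: select which clock (CLOCK_MONOTONIC or
 * CLOCK_BOOTTIME) the ring uses when timing waits for completions.
 */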
static int io_register_clock(struct io_ring_ctx *ctx,
			     struct io_uring_clock_register __user *arg)
{
	struct io_uring_clock_register reg;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;

	switch (reg.clockid) {
	case CLOCK_MONOTONIC:
		ctx->clock_offset = 0;
		break;
	case CLOCK_BOOTTIME:
		ctx->clock_offset = TK_OFFS_BOOT;
		break;
	default:
		return -EINVAL;
	}

	ctx->clockid = reg.clockid;
	return 0;
}

/*
 * State to maintain until we can swap. Both new and old state, used for
 * either mapping or freeing.
 */
struct io_ring_ctx_rings {
	struct io_rings *rings;
	struct io_uring_sqe *sq_sqes;

	struct io_mapped_region sq_region;
	struct io_mapped_region ring_region;
};

static void io_register_free_rings(struct io_ring_ctx *ctx,
				   struct io_ring_ctx_rings *r)
{
	io_free_region(ctx->user, &r->sq_region);
	io_free_region(ctx->user, &r->ring_region);
}

#define swap_old(ctx, o, n, field)		\
	do {					\
		(o).field = (ctx)->field;	\
		(ctx)->field = (n).field;	\
	} while (0)

#define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
#define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \
			 IORING_SETUP_CQE_MIXED | IORING_SETUP_SQE_MIXED)

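/*
 * IORING_REGISTER_RESIZE_RINGS: resize the SQ/CQ rings of an existing ring,
 * currently limited to DEFER_TASKRUN setups. New ring and SQE regions are
 * allocated, any pending SQ/CQ entries are copied over (failing with
 * -EOVERFLOW if they don't fit), and the new regions are swapped in under
 * ctx->mmap_lock and ctx->completion_lock.
 */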
static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_ctx_config config;
	struct io_uring_region_desc rd;
	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
	unsigned i, tail, old_head;
	struct io_uring_params *p = &config.p;
	struct io_rings_layout *rl = &config.layout;
	int ret;

	memset(&config, 0, sizeof(config));

	/* limited to DEFER_TASKRUN for now */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		return -EINVAL;
	if (copy_from_user(p, arg, sizeof(*p)))
		return -EFAULT;
	if (p->flags & ~RESIZE_FLAGS)
		return -EINVAL;

	/* properties that are always inherited */
	p->flags |= (ctx->flags & COPY_FLAGS);

	ret = io_prepare_config(&config);
	if (unlikely(ret))
		return ret;

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(rl->rings_size);
	if (p->flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p->cq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
	if (ret)
		return ret;

	n.rings = io_region_get_ptr(&n.ring_region);

	/*
	 * At this point n.rings is shared with userspace, just like o.rings
	 * is as well. While we don't expect userspace to modify it while
	 * a resize is in progress, and it's most likely that userspace will
	 * shoot itself in the foot if it does, we can't always assume good
	 * intent... Use read/write once helpers from here on to indicate the
	 * shared nature of it.
	 */
	WRITE_ONCE(n.rings->sq_ring_mask, p->sq_entries - 1);
	WRITE_ONCE(n.rings->cq_ring_mask, p->cq_entries - 1);
	WRITE_ONCE(n.rings->sq_ring_entries, p->sq_entries);
	WRITE_ONCE(n.rings->cq_ring_entries, p->cq_entries);

	if (copy_to_user(arg, p, sizeof(*p))) {
		io_register_free_rings(ctx, &n);
		return -EFAULT;
	}

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(rl->sq_size);
	if (p->flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p->sq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
	if (ret) {
		io_register_free_rings(ctx, &n);
		return ret;
	}
	n.sq_sqes = io_region_get_ptr(&n.sq_region);

	/*
	 * If using SQPOLL, park the thread
	 */
	if (ctx->sq_data) {
		mutex_unlock(&ctx->uring_lock);
		io_sq_thread_park(ctx->sq_data);
		mutex_lock(&ctx->uring_lock);
	}

	/*
	 * We'll do the swap. Grab the ctx->mmap_lock, which will exclude
	 * any new mmap's on the ring fd. Clear out existing mappings to prevent
	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
	 * existing rings beyond this point will fail. Not that it could proceed
	 * at this point anyway, as the io_uring mmap side needs to grab the
	 * ctx->mmap_lock as well. Likewise, hold the completion lock over the
	 * duration of the actual swap.
	 */
	mutex_lock(&ctx->mmap_lock);
	spin_lock(&ctx->completion_lock);
	o.rings = ctx->rings;
	ctx->rings = NULL;
	o.sq_sqes = ctx->sq_sqes;
	ctx->sq_sqes = NULL;

	/*
	 * Now copy SQ and CQ entries, if any. If either of the destination
	 * rings can't hold what is already there, then fail the operation.
	 */
	tail = READ_ONCE(o.rings->sq.tail);
	old_head = READ_ONCE(o.rings->sq.head);
	if (tail - old_head > p->sq_entries)
		goto overflow;
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->sq_entries - 1);
		unsigned dst_head = i & (p->sq_entries - 1);

		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
	}
	WRITE_ONCE(n.rings->sq.head, old_head);
	WRITE_ONCE(n.rings->sq.tail, tail);

	tail = READ_ONCE(o.rings->cq.tail);
	old_head = READ_ONCE(o.rings->cq.head);
	if (tail - old_head > p->cq_entries) {
overflow:
		/* restore old rings, and return -EOVERFLOW via cleanup path */
		ctx->rings = o.rings;
		ctx->sq_sqes = o.sq_sqes;
		to_free = &n;
		ret = -EOVERFLOW;
		goto out;
	}
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->cq_entries - 1);
		unsigned dst_head = i & (p->cq_entries - 1);

		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
	}
	WRITE_ONCE(n.rings->cq.head, old_head);
	WRITE_ONCE(n.rings->cq.tail, tail);
	/* invalidate cached cqe refill */
	ctx->cqe_cached = ctx->cqe_sentinel = NULL;

	WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped));
	atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags));
	WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags));
	WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));

	/* all done, store old pointers and assign new ones */
	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
		ctx->sq_array = (u32 *)((char *)n.rings + rl->sq_array_offset);

	ctx->sq_entries = p->sq_entries;
	ctx->cq_entries = p->cq_entries;

	ctx->rings = n.rings;
	ctx->sq_sqes = n.sq_sqes;
	swap_old(ctx, o, n, ring_region);
	swap_old(ctx, o, n, sq_region);
	to_free = &o;
	ret = 0;
out:
	spin_unlock(&ctx->completion_lock);
	mutex_unlock(&ctx->mmap_lock);
	io_register_free_rings(ctx, to_free);

	if (ctx->sq_data)
		io_sq_thread_unpark(ctx->sq_data);

	return ret;
}

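/*
 * IORING_REGISTER_MEM_REGION: register a user-provided memory region with
 * the ring. Only a single parameter region may be registered, and using it
 * for CQ wait arguments (IORING_MEM_REGION_REG_WAIT_ARG) is only accepted
 * while the ring is still disabled.
 */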
static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
{
	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
	struct io_uring_mem_region_reg reg;
	struct io_uring_region_desc __user *rd_uptr;
	struct io_uring_region_desc rd;
	struct io_mapped_region region = {};
	int ret;

	if (io_region_is_set(&ctx->param_region))
		return -EBUSY;
	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
		return -EFAULT;
	rd_uptr = u64_to_user_ptr(reg.region_uptr);
	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;
	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
		return -EINVAL;

	/*
	 * This ensures there are no waiters. Waiters are unlocked and it's
	 * hard to synchronise with them, especially if we need to initialise
	 * the region.
	 */
	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
	    !(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EINVAL;

	ret = io_create_region(ctx, &region, &rd, IORING_MAP_OFF_PARAM_REGION);
	if (ret)
		return ret;
	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
		io_free_region(ctx->user, &region);
		return -EFAULT;
	}

	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
		ctx->cq_wait_arg = io_region_get_ptr(&region);
		ctx->cq_wait_size = rd.size;
	}

	io_region_publish(ctx, &region, &ctx->param_region);
	return 0;
}

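/*
 * Central opcode dispatcher for io_uring_register(), called with
 * ctx->uring_lock held. If the ring is restricted, the opcode must first
 * have been allowed via IORING_REGISTER_RESTRICTIONS.
 */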
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	case IORING_REGISTER_ZCRX_IFQ:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_zcrx_ifq(ctx, arg);
		break;
	case IORING_REGISTER_RESIZE_RINGS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_resize_rings(ctx, arg);
		break;
	case IORING_REGISTER_MEM_REGION:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_mem_region(ctx, arg);
		break;
	case IORING_REGISTER_QUERY:
		ret = io_query(arg, nr_args);
		break;
	case IORING_REGISTER_ZCRX_CTRL:
		ret = io_zcrx_ctrl(ctx, arg, nr_args);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

/*
 * Given an 'fd' value, return the ctx associated with it. If 'registered' is
 * true, then the registered index is used. Otherwise, the normal fd table.
 * Caller must call fput() on the returned file, unless it's an ERR_PTR.
 */
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
	struct file *file;

	if (registered) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return ERR_PTR(-EINVAL);
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (file)
			get_file(file);
	} else {
		file = fget(fd);
	}

	if (unlikely(!file))
		return ERR_PTR(-EBADF);
	if (io_is_uring_fops(file))
		return file;
	fput(file);
	return ERR_PTR(-EOPNOTSUPP);
}

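/*
 * IORING_REGISTER_SEND_MSG_RING: synchronously issue a single
 * IORING_OP_MSG_RING SQE without owning a ring, e.g. to post a message to
 * another task's ring.
 */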
static int io_uring_register_send_msg_ring(void __user *arg, unsigned int nr_args)
{
	struct io_uring_sqe sqe;

	if (!arg || nr_args != 1)
		return -EINVAL;
	if (copy_from_user(&sqe, arg, sizeof(sqe)))
		return -EFAULT;
	/* no flags supported */
	if (sqe.flags)
		return -EINVAL;
	if (sqe.opcode != IORING_OP_MSG_RING)
		return -EINVAL;

	return io_uring_sync_msg_ring(&sqe);
}

/*
 * "blind" registration opcodes are ones where there's no ring given, and
 * hence the source fd must be -1.
 */
static int io_uring_register_blind(unsigned int opcode, void __user *arg,
				   unsigned int nr_args)
{
	switch (opcode) {
	case IORING_REGISTER_SEND_MSG_RING:
		return io_uring_register_send_msg_ring(arg, nr_args);
	case IORING_REGISTER_QUERY:
		return io_query(arg, nr_args);
	}
	return -EINVAL;
}

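/*
 * io_uring_register(2) entry point. If IORING_REGISTER_USE_REGISTERED_RING
 * is set in the opcode, 'fd' is an index into the task's registered ring
 * fds rather than a regular file descriptor, and an 'fd' of -1 selects the
 * "blind" opcodes that don't operate on a ring at all.
 *
 * Illustrative userspace call (raw syscall, no liburing):
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PROBE,
 *		probe, nr_ops);
 */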
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (fd == -1)
		return io_uring_register_blind(opcode, arg, nr_args);

	file = io_uring_register_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);

	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
				ctx->buf_table.nr, ret);
	mutex_unlock(&ctx->uring_lock);

	fput(file);
	return ret;
}