// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)
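
/*
 * IORING_REGISTER_PROBE: fill in an io_uring_probe structure for userspace,
 * marking which opcodes this kernel supports.
 */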
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	size = struct_size(p, ops, nr_args);
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (io_uring_op_supported(i))
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}
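
/*
 * Drop a registered personality: remove the credentials stored under 'id'
 * and release the reference taken at registration time.
 */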
int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}
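
/*
 * IORING_REGISTER_PERSONALITY: stash a reference to the caller's current
 * credentials and hand back an id that submissions can later reference via
 * sqe->personality.
 */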
static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}
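
/*
 * IORING_REGISTER_RESTRICTIONS: install a set of restrictions (allowed
 * register opcodes, SQE opcodes and SQE flags) on a ring that was created
 * with IORING_SETUP_R_DISABLED. Only one registration is allowed, and the
 * restrictions take effect once the ring is enabled.
 */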
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = 0;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].register_op,
				  ctx->restrictions.register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			ret = -EINVAL;
			goto out;
		}
	}

out:
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;

	kfree(res);
	return ret;
}
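
/*
 * IORING_REGISTER_ENABLE_RINGS: bring a ring created with
 * IORING_SETUP_R_DISABLED online. Records the submitter task for
 * single-issuer rings, arms any previously registered restrictions and
 * wakes the SQPOLL thread if it is waiting.
 */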
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}
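
/*
 * Apply (or clear, when new_mask is NULL) the io-wq CPU affinity. For
 * SQPOLL rings the affinity is applied to the SQPOLL thread's io-wq,
 * with uring_lock dropped across the call.
 */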
static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}
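
/*
 * IORING_REGISTER_IOWQ_AFF: copy a CPU mask from userspace (handling the
 * compat bitmap layout) and restrict io-wq workers to those CPUs.
 */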
static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}
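
/*
 * IORING_UNREGISTER_IOWQ_AFF: reset the io-wq CPU affinity to the default.
 */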
static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}
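
/*
 * IORING_REGISTER_IOWQ_MAX_WORKERS: set per-ring limits on the number of
 * bounded and unbounded io-wq workers. The previous limits (or zeroes, if
 * no io-wq exists yet) are copied back to userspace, and for non-SQPOLL
 * rings the new limits are propagated to every task registered with this
 * ring.
 */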
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}
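
/*
 * IORING_REGISTER_CLOCK: select the clock source the ring uses for
 * timekeeping, either CLOCK_MONOTONIC or CLOCK_BOOTTIME.
 */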
static int io_register_clock(struct io_ring_ctx *ctx,
			     struct io_uring_clock_register __user *arg)
{
	struct io_uring_clock_register reg;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;

	switch (reg.clockid) {
	case CLOCK_MONOTONIC:
		ctx->clock_offset = 0;
		break;
	case CLOCK_BOOTTIME:
		ctx->clock_offset = TK_OFFS_BOOT;
		break;
	default:
		return -EINVAL;
	}

	ctx->clockid = reg.clockid;
	return 0;
}
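
/*
 * Core of io_uring_register(): validate ring state and any registered
 * restrictions, then dispatch to the handler for the requested opcode.
 * Called with uring_lock held; individual handlers may drop and reacquire
 * it.
 */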
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
/*
 * Given an 'fd' value, return the ctx associated with it. If 'registered' is
 * true, then the registered index is used. Otherwise, the normal fd table.
 * Caller must call fput() on the returned file, unless it's an ERR_PTR.
 */
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
	struct file *file;

	if (registered) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return ERR_PTR(-EINVAL);
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
	} else {
		file = fget(fd);
	}

	if (unlikely(!file))
		return ERR_PTR(-EBADF);
	if (io_is_uring_fops(file))
		return file;
	fput(file);
	return ERR_PTR(-EOPNOTSUPP);
}
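
/*
 * io_uring_register() syscall entry point: resolve the ring fd (optionally
 * via the registered-ring index when IORING_REGISTER_USE_REGISTERED_RING is
 * set in the opcode), then run the requested registration under uring_lock.
 */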
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	file = io_uring_register_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);
	mutex_unlock(&ctx->uring_lock);
	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
	if (!use_registered_ring)
		fput(file);
	return ret;
}