// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "filetable.h"
#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"
#include "msg_ring.h"
#include "memmap.h"
#include "zcrx.h"
#include "query.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	size = struct_size(p, ops, nr_args);
	p = memdup_user(arg, size);
	if (IS_ERR(p))
		return PTR_ERR(p);
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (io_uring_op_supported(i))
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}

static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
					struct io_restriction *restrictions)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = -EINVAL;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST)
				goto err;
			__set_bit(res[i].register_op, restrictions->register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST)
				goto err;
			__set_bit(res[i].sqe_op, restrictions->sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			restrictions->sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			restrictions->sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			goto err;
		}
	}

	ret = 0;

err:
	kfree(res);
	return ret;
}
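
/*
 * Illustrative sketch only, not compiled as part of this file: a userspace
 * program that created its ring with IORING_SETUP_R_DISABLED might describe
 * a restriction set like the one parsed above. The struct and constant names
 * are the uapi ones used in io_parse_restrictions(); the raw syscall is shown
 * as a plain call and 'ring_fd' is an assumed ring file descriptor.
 *
 *	struct io_uring_restriction res[2] = { };
 *
 *	res[0].opcode = IORING_RESTRICTION_REGISTER_OP;
 *	res[0].register_op = IORING_REGISTER_ENABLE_RINGS;
 *	res[1].opcode = IORING_RESTRICTION_SQE_OP;
 *	res[1].sqe_op = IORING_OP_READV;
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_RESTRICTIONS, res, 2);
 */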

static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	int ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;
	return ret;
}

static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}
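
/*
 * Illustrative sketch only, not compiled here: IORING_REGISTER_IOWQ_AFF takes
 * a CPU bitmask with its length in bytes passed as nr_args, so a caller that
 * wants io-wq workers pinned to CPUs 0 and 1 might do something like the
 * following. The use of cpu_set_t is an assumption about the caller, not a
 * requirement of this interface; the raw syscall is shown as a plain call.
 *
 *	cpu_set_t mask;
 *
 *	CPU_ZERO(&mask);
 *	CPU_SET(0, &mask);
 *	CPU_SET(1, &mask);
 *	io_uring_register(ring_fd, IORING_REGISTER_IOWQ_AFF, &mask, sizeof(mask));
 */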

static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			struct task_struct *tsk;

			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			tsk = sqpoll_task_locked(sqd);
			if (tsk)
				tctx = tsk->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}

static int io_register_clock(struct io_ring_ctx *ctx,
			     struct io_uring_clock_register __user *arg)
{
	struct io_uring_clock_register reg;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;

	switch (reg.clockid) {
	case CLOCK_MONOTONIC:
		ctx->clock_offset = 0;
		break;
	case CLOCK_BOOTTIME:
		ctx->clock_offset = TK_OFFS_BOOT;
		break;
	default:
		return -EINVAL;
	}

	ctx->clockid = reg.clockid;
	return 0;
}
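
/*
 * Illustrative sketch only, not compiled here: switching the ring's wait
 * clock to CLOCK_BOOTTIME, matching the validation above. The reserved
 * fields must stay zero and nr_args must be 0 for IORING_REGISTER_CLOCK
 * (see the dispatch below); the raw syscall is shown as a plain call.
 *
 *	struct io_uring_clock_register reg = {
 *		.clockid = CLOCK_BOOTTIME,
 *	};
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_CLOCK, &reg, 0);
 */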

/*
 * State to maintain until we can swap. Both new and old state, used for
 * either mapping or freeing.
 */
struct io_ring_ctx_rings {
	struct io_rings *rings;
	struct io_uring_sqe *sq_sqes;

	struct io_mapped_region sq_region;
	struct io_mapped_region ring_region;
};

static void io_register_free_rings(struct io_ring_ctx *ctx,
				   struct io_uring_params *p,
				   struct io_ring_ctx_rings *r)
{
	io_free_region(ctx, &r->sq_region);
	io_free_region(ctx, &r->ring_region);
}

#define swap_old(ctx, o, n, field)		\
	do {					\
		(o).field = (ctx)->field;	\
		(ctx)->field = (n).field;	\
	} while (0)

#define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
#define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \
			 IORING_SETUP_CQE_MIXED)
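
/*
 * Illustrative sketch only, not compiled here: a DEFER_TASKRUN ring could be
 * grown from userspace roughly as follows. Only bits in RESIZE_FLAGS may be
 * set by the caller; everything in COPY_FLAGS is inherited from the existing
 * ring, and the updated parameters and offsets are copied back on return.
 * The raw syscall is shown as a plain call.
 *
 *	struct io_uring_params p = {
 *		.sq_entries = 256,
 *		.cq_entries = 512,
 *		.flags = IORING_SETUP_CQSIZE,
 *	};
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_RESIZE_RINGS, &p, 1);
 */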

static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_region_desc rd;
	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
	size_t size, sq_array_offset;
	unsigned i, tail, old_head;
	struct io_uring_params p;
	int ret;

	/* limited to DEFER_TASKRUN for now */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		return -EINVAL;
	if (copy_from_user(&p, arg, sizeof(p)))
		return -EFAULT;
	if (p.flags & ~RESIZE_FLAGS)
		return -EINVAL;

	/* properties that are always inherited */
	p.flags |= (ctx->flags & COPY_FLAGS);

	ret = io_uring_fill_params(p.sq_entries, &p);
	if (unlikely(ret))
		return ret;

	/* nothing to do, but copy params back */
	if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) {
		if (copy_to_user(arg, &p, sizeof(p)))
			return -EFAULT;
		return 0;
	}

	size = rings_size(p.flags, p.sq_entries, p.cq_entries,
			  &sq_array_offset);
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(size);
	if (p.flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p.cq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
	if (ret) {
		io_register_free_rings(ctx, &p, &n);
		return ret;
	}
	n.rings = io_region_get_ptr(&n.ring_region);

	/*
	 * At this point n.rings is shared with userspace, just like o.rings
	 * is as well. While we don't expect userspace to modify it while
	 * a resize is in progress, and it's most likely that userspace will
	 * shoot itself in the foot if it does, we can't always assume good
	 * intent... Use read/write once helpers from here on to indicate the
	 * shared nature of it.
	 */
	WRITE_ONCE(n.rings->sq_ring_mask, p.sq_entries - 1);
	WRITE_ONCE(n.rings->cq_ring_mask, p.cq_entries - 1);
	WRITE_ONCE(n.rings->sq_ring_entries, p.sq_entries);
	WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries);

	if (copy_to_user(arg, &p, sizeof(p))) {
		io_register_free_rings(ctx, &p, &n);
		return -EFAULT;
	}

	if (p.flags & IORING_SETUP_SQE128)
		size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries);
	else
		size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
	if (size == SIZE_MAX) {
		io_register_free_rings(ctx, &p, &n);
		return -EOVERFLOW;
	}

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(size);
	if (p.flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p.sq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region_mmap_safe(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
	if (ret) {
		io_register_free_rings(ctx, &p, &n);
		return ret;
	}
	n.sq_sqes = io_region_get_ptr(&n.sq_region);

	/*
	 * If using SQPOLL, park the thread
	 */
	if (ctx->sq_data) {
		mutex_unlock(&ctx->uring_lock);
		io_sq_thread_park(ctx->sq_data);
		mutex_lock(&ctx->uring_lock);
	}

	/*
	 * We'll do the swap. Grab the ctx->mmap_lock, which will exclude
	 * any new mmap's on the ring fd. Clear out existing mappings to prevent
	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
	 * existing rings beyond this point will fail. Not that it could proceed
	 * at this point anyway, as the io_uring mmap side needs to grab the
	 * ctx->mmap_lock as well. Likewise, hold the completion lock over the
	 * duration of the actual swap.
	 */
	mutex_lock(&ctx->mmap_lock);
	spin_lock(&ctx->completion_lock);
	o.rings = ctx->rings;
	ctx->rings = NULL;
	o.sq_sqes = ctx->sq_sqes;
	ctx->sq_sqes = NULL;

	/*
	 * Now copy SQ and CQ entries, if any. If either of the destination
	 * rings can't hold what is already there, then fail the operation.
	 */
	tail = READ_ONCE(o.rings->sq.tail);
	old_head = READ_ONCE(o.rings->sq.head);
	if (tail - old_head > p.sq_entries)
		goto overflow;
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->sq_entries - 1);
		unsigned dst_head = i & (p.sq_entries - 1);

		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
	}
	WRITE_ONCE(n.rings->sq.head, old_head);
	WRITE_ONCE(n.rings->sq.tail, tail);

	tail = READ_ONCE(o.rings->cq.tail);
	old_head = READ_ONCE(o.rings->cq.head);
	if (tail - old_head > p.cq_entries) {
overflow:
		/* restore old rings, and return -EOVERFLOW via cleanup path */
		ctx->rings = o.rings;
		ctx->sq_sqes = o.sq_sqes;
		to_free = &n;
		ret = -EOVERFLOW;
		goto out;
	}
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->cq_entries - 1);
		unsigned dst_head = i & (p.cq_entries - 1);

		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
	}
	WRITE_ONCE(n.rings->cq.head, old_head);
	WRITE_ONCE(n.rings->cq.tail, tail);
	/* invalidate cached cqe refill */
	ctx->cqe_cached = ctx->cqe_sentinel = NULL;

	WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped));
	atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags));
	WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags));
	WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));

	/* all done, store old pointers and assign new ones */
	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
		ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset);

	ctx->sq_entries = p.sq_entries;
	ctx->cq_entries = p.cq_entries;

	ctx->rings = n.rings;
	ctx->sq_sqes = n.sq_sqes;
	swap_old(ctx, o, n, ring_region);
	swap_old(ctx, o, n, sq_region);
	to_free = &o;
	ret = 0;
out:
	spin_unlock(&ctx->completion_lock);
	mutex_unlock(&ctx->mmap_lock);
	io_register_free_rings(ctx, &p, to_free);

	if (ctx->sq_data)
		io_sq_thread_unpark(ctx->sq_data);

	return ret;
}
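
/*
 * Illustrative sketch only, not compiled here: registering a parameter region
 * that will also serve as the CQ wait argument area, as handled by
 * io_register_mem_region() below. The ring must still be R_DISABLED when
 * IORING_MEM_REGION_REG_WAIT_ARG is requested, as enforced there. 'buf' is
 * assumed to be a page-sized, page-aligned user allocation, and the raw
 * syscall is shown as a plain call.
 *
 *	struct io_uring_region_desc rd = {
 *		.user_addr = (__u64)(uintptr_t)buf,
 *		.size = 4096,
 *		.flags = IORING_MEM_REGION_TYPE_USER,
 *	};
 *	struct io_uring_mem_region_reg reg = {
 *		.region_uptr = (__u64)(uintptr_t)&rd,
 *		.flags = IORING_MEM_REGION_REG_WAIT_ARG,
 *	};
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_MEM_REGION, &reg, 1);
 */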

static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
{
	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
	struct io_uring_mem_region_reg reg;
	struct io_uring_region_desc __user *rd_uptr;
	struct io_uring_region_desc rd;
	int ret;

	if (io_region_is_set(&ctx->param_region))
		return -EBUSY;
	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
		return -EFAULT;
	rd_uptr = u64_to_user_ptr(reg.region_uptr);
	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;
	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
		return -EINVAL;

	/*
	 * This ensures there are no waiters. Waiters are unlocked and it's
	 * hard to synchronise with them, especially if we need to initialise
	 * the region.
	 */
	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
	    !(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EINVAL;

	ret = io_create_region_mmap_safe(ctx, &ctx->param_region, &rd,
					 IORING_MAP_OFF_PARAM_REGION);
	if (ret)
		return ret;
	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
		io_free_region(ctx, &ctx->param_region);
		return -EFAULT;
	}

	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
		ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region);
		ctx->cq_wait_size = rd.size;
	}
	return 0;
}
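
/*
 * Illustrative sketch only, not compiled here: the registration sequence a
 * sandboxing parent might follow before handing the ring fd to a less
 * trusted task, tying together the helpers above. The ring is created
 * disabled, the allowed operations are locked down, and only then are the
 * rings enabled. 'res' and 'nr_res' describe the allowed set as parsed by
 * io_parse_restrictions(); the raw syscall is shown as a plain call.
 *
 *	ring created with IORING_SETUP_R_DISABLED in io_uring_setup(), then:
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_RESTRICTIONS, res, nr_res);
 *	io_uring_register(ring_fd, IORING_REGISTER_ENABLE_RINGS, NULL, 0);
 */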

static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	case IORING_REGISTER_ZCRX_IFQ:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_zcrx_ifq(ctx, arg);
		break;
	case IORING_REGISTER_RESIZE_RINGS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_resize_rings(ctx, arg);
		break;
	case IORING_REGISTER_MEM_REGION:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_mem_region(ctx, arg);
		break;
	case IORING_REGISTER_QUERY:
		ret = io_query(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_ZCRX_REFILL:
		ret = io_zcrx_return_bufs(ctx, arg, nr_args);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
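
/*
 * Illustrative sketch only, not compiled here: querying which opcodes the
 * running kernel supports via IORING_REGISTER_PROBE, which the switch above
 * dispatches to io_probe(). The probe buffer must be zeroed before the call,
 * and nr_args is capped at 256. 'op' and 'supported' are assumed caller-side
 * variables; the raw syscall is shown as a plain call.
 *
 *	struct io_uring_probe *p;
 *
 *	p = calloc(1, sizeof(*p) + 256 * sizeof(struct io_uring_probe_op));
 *	io_uring_register(ring_fd, IORING_REGISTER_PROBE, p, 256);
 *	supported = op <= p->last_op &&
 *		    (p->ops[op].flags & IO_URING_OP_SUPPORTED);
 */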

/*
 * Given an 'fd' value, return the ctx associated with it. If 'registered' is
 * true, then the registered index is used. Otherwise, the normal fd table.
 * Caller must call fput() on the returned file, unless it's an ERR_PTR.
 */
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
	struct file *file;

	if (registered) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return ERR_PTR(-EINVAL);
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (file)
			get_file(file);
	} else {
		file = fget(fd);
	}

	if (unlikely(!file))
		return ERR_PTR(-EBADF);
	if (io_is_uring_fops(file))
		return file;
	fput(file);
	return ERR_PTR(-EOPNOTSUPP);
}
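
/*
 * Illustrative sketch only, not compiled here: once a ring fd has been
 * registered via IORING_REGISTER_RING_FDS, later registrations can pass the
 * registered index instead of the real fd by OR-ing
 * IORING_REGISTER_USE_REGISTERED_RING into the opcode, which is the
 * 'registered' lookup path resolved above. 'reg_idx' is assumed to be the
 * index obtained from the ring fd registration; the raw syscall is shown as
 * a plain call.
 *
 *	io_uring_register(reg_idx,
 *			  IORING_REGISTER_ENABLE_RINGS |
 *			  IORING_REGISTER_USE_REGISTERED_RING,
 *			  NULL, 0);
 */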

static int io_uring_register_send_msg_ring(void __user *arg, unsigned int nr_args)
{
	struct io_uring_sqe sqe;

	if (!arg || nr_args != 1)
		return -EINVAL;
	if (copy_from_user(&sqe, arg, sizeof(sqe)))
		return -EFAULT;
	/* no flags supported */
	if (sqe.flags)
		return -EINVAL;
	if (sqe.opcode != IORING_OP_MSG_RING)
		return -EINVAL;

	return io_uring_sync_msg_ring(&sqe);
}

/*
 * "blind" registration opcodes are ones where there's no ring given, and
 * hence the source fd must be -1.
 */
static int io_uring_register_blind(unsigned int opcode, void __user *arg,
				   unsigned int nr_args)
{
	switch (opcode) {
	case IORING_REGISTER_SEND_MSG_RING:
		return io_uring_register_send_msg_ring(arg, nr_args);
	case IORING_REGISTER_QUERY:
		return io_query(NULL, arg, nr_args);
	}
	return -EINVAL;
}

SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (fd == -1)
		return io_uring_register_blind(opcode, arg, nr_args);

	file = io_uring_register_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);

	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
				ctx->buf_table.nr, ret);
	mutex_unlock(&ctx->uring_lock);

	fput(file);
	return ret;
}
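
/*
 * Illustrative sketch only, not compiled here: the "blind" path above is
 * reached by passing fd == -1, for example to post a synchronous message
 * ring CQE without owning a ring. Only the opcode and the fields that
 * io_uring_sync_msg_ring() consumes matter, and sqe.flags must be zero.
 * 'target_fd' is an assumed io_uring fd belonging to another ring; the raw
 * syscall is shown as a plain call.
 *
 *	struct io_uring_sqe sqe = { };
 *
 *	sqe.opcode = IORING_OP_MSG_RING;
 *	sqe.fd = target_fd;
 *	io_uring_register(-1, IORING_REGISTER_SEND_MSG_RING, &sqe, 1);
 */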