// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"
#include "msg_ring.h"
#include "memmap.h"
#include "zcrx.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	size = struct_size(p, ops, nr_args);
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (io_uring_op_supported(i))
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}

static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
					struct io_restriction *restrictions)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = -EINVAL;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST)
				goto err;
			__set_bit(res[i].register_op, restrictions->register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST)
				goto err;
			__set_bit(res[i].sqe_op, restrictions->sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			restrictions->sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			restrictions->sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			goto err;
		}
	}

	ret = 0;

err:
	kfree(res);
	return ret;
}
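
/*
 * Illustrative userspace sketch (documentation only, not compiled here):
 * restrictions are registered once, while the ring is still
 * IORING_SETUP_R_DISABLED, as an array of struct io_uring_restriction
 * entries. ring_fd below is a placeholder for a ring created with
 * IORING_SETUP_R_DISABLED:
 *
 *	struct io_uring_restriction res[] = {
 *		{ .opcode = IORING_RESTRICTION_SQE_OP,
 *		  .sqe_op = IORING_OP_READ },
 *		{ .opcode = IORING_RESTRICTION_REGISTER_OP,
 *		  .register_op = IORING_REGISTER_BUFFERS },
 *	};
 *
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_RESTRICTIONS, res, 2);
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_ENABLE_RINGS, NULL, 0);
 */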

static __cold
int io_register_restrictions(struct io_ring_ctx *ctx,
			     void __user *arg, unsigned int nr_args)
{
	int ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;
	return ret;
}

static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}
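
/*
 * Illustrative userspace sketch (documentation only, not compiled here):
 * IORING_REGISTER_IOWQ_AFF takes a CPU mask in arg and the mask size in
 * bytes as nr_args, restricting the ring's async workers to those CPUs.
 * ring_fd is a placeholder; pinning the workers to CPU 0 might look like:
 *
 *	cpu_set_t mask;
 *
 *	CPU_ZERO(&mask);
 *	CPU_SET(0, &mask);
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_IOWQ_AFF,
 *		&mask, sizeof(mask));
 */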

static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			struct task_struct *tsk;

			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			tsk = sqpoll_task_locked(sqd);
			if (tsk)
				tctx = tsk->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}

static int io_register_clock(struct io_ring_ctx *ctx,
			     struct io_uring_clock_register __user *arg)
{
	struct io_uring_clock_register reg;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;

	switch (reg.clockid) {
	case CLOCK_MONOTONIC:
		ctx->clock_offset = 0;
		break;
	case CLOCK_BOOTTIME:
		ctx->clock_offset = TK_OFFS_BOOT;
		break;
	default:
		return -EINVAL;
	}

	ctx->clockid = reg.clockid;
	return 0;
}
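
/*
 * Illustrative userspace sketch (documentation only, not compiled here):
 * IORING_REGISTER_CLOCK selects the clock used for timed CQ waits; only
 * CLOCK_MONOTONIC and CLOCK_BOOTTIME are accepted, reserved fields must be
 * zero, and nr_args must be 0. ring_fd is a placeholder:
 *
 *	struct io_uring_clock_register reg = {
 *		.clockid = CLOCK_BOOTTIME,
 *	};
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_CLOCK,
 *		&reg, 0);
 */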

/*
 * State to maintain until we can swap. Both new and old state, used for
 * either mapping or freeing.
 */
struct io_ring_ctx_rings {
	struct io_rings *rings;
	struct io_uring_sqe *sq_sqes;

	struct io_mapped_region sq_region;
	struct io_mapped_region ring_region;
};

static void io_register_free_rings(struct io_ring_ctx *ctx,
				   struct io_uring_params *p,
				   struct io_ring_ctx_rings *r)
{
	io_free_region(ctx, &r->sq_region);
	io_free_region(ctx, &r->ring_region);
}

#define swap_old(ctx, o, n, field)		\
	do {					\
		(o).field = (ctx)->field;	\
		(ctx)->field = (n).field;	\
	} while (0)

#define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
#define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP)
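
/*
 * Illustrative userspace sketch (documentation only, not compiled here):
 * ring resizing reuses struct io_uring_params. The caller may only set
 * sq_entries/cq_entries and the RESIZE_FLAGS above; COPY_FLAGS properties
 * are inherited and the final offsets are copied back. The ring must have
 * been created with IORING_SETUP_DEFER_TASKRUN. ring_fd, old_sq and old_cq
 * are placeholders:
 *
 *	struct io_uring_params p = {
 *		.sq_entries	= old_sq * 2,
 *		.cq_entries	= old_cq * 2,
 *		.flags		= IORING_SETUP_CQSIZE,
 *	};
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_RESIZE_RINGS,
 *		&p, 1);
 */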

static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_region_desc rd;
	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
	size_t size, sq_array_offset;
	unsigned i, tail, old_head;
	struct io_uring_params p;
	int ret;

	/* for single issuer, must be owner resizing */
	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER &&
	    current != ctx->submitter_task)
		return -EEXIST;
	/* limited to DEFER_TASKRUN for now */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		return -EINVAL;
	if (copy_from_user(&p, arg, sizeof(p)))
		return -EFAULT;
	if (p.flags & ~RESIZE_FLAGS)
		return -EINVAL;

	/* properties that are always inherited */
	p.flags |= (ctx->flags & COPY_FLAGS);

	ret = io_uring_fill_params(p.sq_entries, &p);
	if (unlikely(ret))
		return ret;

	/* nothing to do, but copy params back */
	if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) {
		if (copy_to_user(arg, &p, sizeof(p)))
			return -EFAULT;
		return 0;
	}

	size = rings_size(p.flags, p.sq_entries, p.cq_entries,
			  &sq_array_offset);
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(size);
	if (p.flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p.cq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
	if (ret) {
		io_register_free_rings(ctx, &p, &n);
		return ret;
	}
	n.rings = io_region_get_ptr(&n.ring_region);

	/*
	 * At this point n.rings is shared with userspace, just like o.rings
	 * is as well. While we don't expect userspace to modify it while
	 * a resize is in progress, and it's most likely that userspace will
	 * shoot itself in the foot if it does, we can't always assume good
	 * intent... Use read/write once helpers from here on to indicate the
	 * shared nature of it.
	 */
	WRITE_ONCE(n.rings->sq_ring_mask, p.sq_entries - 1);
	WRITE_ONCE(n.rings->cq_ring_mask, p.cq_entries - 1);
	WRITE_ONCE(n.rings->sq_ring_entries, p.sq_entries);
	WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries);

	if (copy_to_user(arg, &p, sizeof(p))) {
		io_register_free_rings(ctx, &p, &n);
		return -EFAULT;
	}

	if (p.flags & IORING_SETUP_SQE128)
		size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries);
	else
		size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
	if (size == SIZE_MAX) {
		io_register_free_rings(ctx, &p, &n);
		return -EOVERFLOW;
	}

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(size);
	if (p.flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p.sq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region_mmap_safe(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
	if (ret) {
		io_register_free_rings(ctx, &p, &n);
		return ret;
	}
	n.sq_sqes = io_region_get_ptr(&n.sq_region);

	/*
	 * If using SQPOLL, park the thread
	 */
	if (ctx->sq_data) {
		mutex_unlock(&ctx->uring_lock);
		io_sq_thread_park(ctx->sq_data);
		mutex_lock(&ctx->uring_lock);
	}

	/*
	 * We'll do the swap. Grab the ctx->mmap_lock, which will exclude
	 * any new mmap's on the ring fd. Clear out existing mappings to prevent
	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
	 * existing rings beyond this point will fail. Not that it could proceed
	 * at this point anyway, as the io_uring mmap side needs to grab the
	 * ctx->mmap_lock as well. Likewise, hold the completion lock over the
	 * duration of the actual swap.
	 */
	mutex_lock(&ctx->mmap_lock);
	spin_lock(&ctx->completion_lock);
	o.rings = ctx->rings;
	ctx->rings = NULL;
	o.sq_sqes = ctx->sq_sqes;
	ctx->sq_sqes = NULL;

	/*
	 * Now copy SQ and CQ entries, if any. If either of the destination
	 * rings can't hold what is already there, then fail the operation.
	 */
	tail = READ_ONCE(o.rings->sq.tail);
	old_head = READ_ONCE(o.rings->sq.head);
	if (tail - old_head > p.sq_entries)
		goto overflow;
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->sq_entries - 1);
		unsigned dst_head = i & (p.sq_entries - 1);

		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
	}
	WRITE_ONCE(n.rings->sq.head, old_head);
	WRITE_ONCE(n.rings->sq.tail, tail);

	tail = READ_ONCE(o.rings->cq.tail);
	old_head = READ_ONCE(o.rings->cq.head);
	if (tail - old_head > p.cq_entries) {
overflow:
		/* restore old rings, and return -EOVERFLOW via cleanup path */
		ctx->rings = o.rings;
		ctx->sq_sqes = o.sq_sqes;
		to_free = &n;
		ret = -EOVERFLOW;
		goto out;
	}
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->cq_entries - 1);
		unsigned dst_head = i & (p.cq_entries - 1);

		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
	}
	WRITE_ONCE(n.rings->cq.head, old_head);
	WRITE_ONCE(n.rings->cq.tail, tail);
	/* invalidate cached cqe refill */
	ctx->cqe_cached = ctx->cqe_sentinel = NULL;

	WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped));
	atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags));
	WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags));
	WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));

	/* all done, store old pointers and assign new ones */
	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
		ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset);

	ctx->sq_entries = p.sq_entries;
	ctx->cq_entries = p.cq_entries;

	ctx->rings = n.rings;
	ctx->sq_sqes = n.sq_sqes;
	swap_old(ctx, o, n, ring_region);
	swap_old(ctx, o, n, sq_region);
	to_free = &o;
	ret = 0;
out:
	spin_unlock(&ctx->completion_lock);
	mutex_unlock(&ctx->mmap_lock);
	io_register_free_rings(ctx, &p, to_free);

	if (ctx->sq_data)
		io_sq_thread_unpark(ctx->sq_data);

	return ret;
}
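
/*
 * Illustrative userspace sketch (documentation only, not compiled here):
 * a parameter region is described indirectly, with
 * struct io_uring_mem_region_reg pointing at a struct io_uring_region_desc.
 * Registering user memory as the CQ wait-argument region requires the ring
 * to still be IORING_SETUP_R_DISABLED. ring_fd and buf (page aligned) are
 * placeholders:
 *
 *	struct io_uring_region_desc rd = {
 *		.user_addr	= (__u64)(unsigned long)buf,
 *		.size		= 4096,
 *		.flags		= IORING_MEM_REGION_TYPE_USER,
 *	};
 *	struct io_uring_mem_region_reg mr = {
 *		.region_uptr	= (__u64)(unsigned long)&rd,
 *		.flags		= IORING_MEM_REGION_REG_WAIT_ARG,
 *	};
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_MEM_REGION,
 *		&mr, 1);
 */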

static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
{
	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
	struct io_uring_mem_region_reg reg;
	struct io_uring_region_desc __user *rd_uptr;
	struct io_uring_region_desc rd;
	int ret;

	if (io_region_is_set(&ctx->param_region))
		return -EBUSY;
	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
		return -EFAULT;
	rd_uptr = u64_to_user_ptr(reg.region_uptr);
	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;
	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
		return -EINVAL;

	/*
	 * This ensures there are no waiters. Waiters are unlocked and it's
	 * hard to synchronise with them, especially if we need to initialise
	 * the region.
	 */
	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
	    !(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EINVAL;

	ret = io_create_region_mmap_safe(ctx, &ctx->param_region, &rd,
					 IORING_MAP_OFF_PARAM_REGION);
	if (ret)
		return ret;
	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
		io_free_region(ctx, &ctx->param_region);
		return -EFAULT;
	}

	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
		ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region);
		ctx->cq_wait_size = rd.size;
	}
	return 0;
}
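
/*
 * Illustrative userspace sketch (documentation only, not compiled here):
 * io_probe() above expects a zeroed struct io_uring_probe with room for
 * nr_args ops entries and reports per-opcode support via
 * IO_URING_OP_SUPPORTED. ring_fd is a placeholder:
 *
 *	size_t len = sizeof(struct io_uring_probe) +
 *		     IORING_OP_LAST * sizeof(struct io_uring_probe_op);
 *	struct io_uring_probe *p = calloc(1, len);
 *
 *	if (!syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PROBE,
 *		     p, IORING_OP_LAST) &&
 *	    (p->ops[IORING_OP_READ].flags & IO_URING_OP_SUPPORTED))
 *		puts("IORING_OP_READ is supported");
 */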

static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	case IORING_REGISTER_ZCRX_IFQ:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_zcrx_ifq(ctx, arg);
		break;
	case IORING_REGISTER_RESIZE_RINGS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_resize_rings(ctx, arg);
		break;
	case IORING_REGISTER_MEM_REGION:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_mem_region(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
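
/*
 * Illustrative userspace sketch (documentation only, not compiled here):
 * IORING_REGISTER_PERSONALITY snapshots the caller's credentials and
 * returns an id that later SQEs can select via sqe->personality; both arg
 * and nr_args must be zero. Unregistration passes the id as nr_args.
 * ring_fd is a placeholder:
 *
 *	int id = syscall(__NR_io_uring_register, ring_fd,
 *			 IORING_REGISTER_PERSONALITY, NULL, 0);
 *
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_UNREGISTER_PERSONALITY, NULL, id);
 */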

/*
 * Given an 'fd' value, return the ctx associated with it. If 'registered' is
 * true, then the registered index is used. Otherwise, the normal fd table.
 * Caller must call fput() on the returned file, unless it's an ERR_PTR.
 */
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
	struct file *file;

	if (registered) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return ERR_PTR(-EINVAL);
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (file)
			get_file(file);
	} else {
		file = fget(fd);
	}

	if (unlikely(!file))
		return ERR_PTR(-EBADF);
	if (io_is_uring_fops(file))
		return file;
	fput(file);
	return ERR_PTR(-EOPNOTSUPP);
}
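
/*
 * Illustrative userspace sketch (documentation only, not compiled here):
 * once a ring fd has been registered via IORING_REGISTER_RING_FDS, later
 * register calls may pass the returned slot instead of the real fd by
 * OR'ing IORING_REGISTER_USE_REGISTERED_RING into the opcode; the kernel
 * writes the chosen slot back into ->offset. ring_fd, opcode, arg and
 * nr_args are placeholders:
 *
 *	struct io_uring_rsrc_update upd = {
 *		.offset	= -1U,
 *		.data	= ring_fd,
 *	};
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_RING_FDS,
 *		&upd, 1);
 *	syscall(__NR_io_uring_register, upd.offset,
 *		opcode | IORING_REGISTER_USE_REGISTERED_RING, arg, nr_args);
 */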

/*
 * "blind" registration opcodes are ones where there's no ring given, and
 * hence the source fd must be -1.
 */
static int io_uring_register_blind(unsigned int opcode, void __user *arg,
				   unsigned int nr_args)
{
	switch (opcode) {
	case IORING_REGISTER_SEND_MSG_RING: {
		struct io_uring_sqe sqe;

		if (!arg || nr_args != 1)
			return -EINVAL;
		if (copy_from_user(&sqe, arg, sizeof(sqe)))
			return -EFAULT;
		/* no flags supported */
		if (sqe.flags)
			return -EINVAL;
		if (sqe.opcode == IORING_OP_MSG_RING)
			return io_uring_sync_msg_ring(&sqe);
	}
	}

	return -EINVAL;
}

SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (fd == -1)
		return io_uring_register_blind(opcode, arg, nr_args);

	file = io_uring_register_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);

	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
				ctx->buf_table.nr, ret);
	mutex_unlock(&ctx->uring_lock);

	fput(file);
	return ret;
}
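
/*
 * Illustrative userspace sketch (documentation only, not compiled here):
 * every opcode above is driven by a single io_uring_register() call against
 * the ring fd, e.g. registering one fixed buffer, with ring_fd, buf and
 * buf_len as placeholders:
 *
 *	struct iovec iov = { .iov_base = buf, .iov_len = buf_len };
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_BUFFERS,
 *		&iov, 1);
 */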