// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"
#include "msg_ring.h"
#include "memmap.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	size = struct_size(p, ops, nr_args);
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (io_uring_op_supported(i))
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}


static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

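/*
 * Usage sketch (illustrative userspace code, not part of this file; error
 * handling elided, ring_fd is a placeholder for the ring's descriptor):
 * restrictions are installed with IORING_REGISTER_RESTRICTIONS on a ring
 * created with IORING_SETUP_R_DISABLED, before the ring is enabled:
 *
 *	struct io_uring_restriction res[] = {
 *		{ .opcode = IORING_RESTRICTION_SQE_OP,
 *		  .sqe_op = IORING_OP_READV },
 *		{ .opcode = IORING_RESTRICTION_REGISTER_OP,
 *		  .register_op = IORING_REGISTER_BUFFERS },
 *	};
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_RESTRICTIONS, res, 2);
 *	io_uring_register(ring_fd, IORING_REGISTER_ENABLE_RINGS, NULL, 0);
 */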
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = 0;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].register_op,
				  ctx->restrictions.register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			ret = -EINVAL;
			goto out;
		}
	}

out:
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;

	kfree(res);
	return ret;
}

static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

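/*
 * Usage sketch (illustrative, error handling elided; ring_fd is a
 * placeholder): the argument to IORING_REGISTER_IOWQ_MAX_WORKERS is an
 * array of two values, [0] for bounded and [1] for unbounded io-wq
 * workers. A zero entry leaves that limit untouched, and the limits that
 * were previously in effect are copied back to userspace:
 *
 *	unsigned int limits[2] = { 8, 0 };	// cap bounded workers at 8
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_IOWQ_MAX_WORKERS,
 *			  limits, 2);
 *	// limits[] now holds the previous values
 */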
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}

static int io_register_clock(struct io_ring_ctx *ctx,
			     struct io_uring_clock_register __user *arg)
{
	struct io_uring_clock_register reg;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;

	switch (reg.clockid) {
	case CLOCK_MONOTONIC:
		ctx->clock_offset = 0;
		break;
	case CLOCK_BOOTTIME:
		ctx->clock_offset = TK_OFFS_BOOT;
		break;
	default:
		return -EINVAL;
	}

	ctx->clockid = reg.clockid;
	return 0;
}

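/*
 * Usage sketch (illustrative, error handling elided; ring_fd is a
 * placeholder): ring resizing is driven by IORING_REGISTER_RESIZE_RINGS
 * with a struct io_uring_params holding the new entry counts. Only
 * IORING_SETUP_CQSIZE and IORING_SETUP_CLAMP may be passed in p.flags;
 * the remaining setup flags are inherited from the existing ring:
 *
 *	struct io_uring_params p = {
 *		.sq_entries = 256,
 *		.cq_entries = 512,
 *		.flags = IORING_SETUP_CQSIZE,
 *	};
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_RESIZE_RINGS, &p, 1);
 */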
/*
 * State to maintain until we can swap. Both new and old state, used for
 * either mapping or freeing.
 */
struct io_ring_ctx_rings {
	unsigned short n_ring_pages;
	unsigned short n_sqe_pages;
	struct page **ring_pages;
	struct page **sqe_pages;
	struct io_uring_sqe *sq_sqes;
	struct io_rings *rings;
};

static void io_register_free_rings(struct io_uring_params *p,
				   struct io_ring_ctx_rings *r)
{
	if (!(p->flags & IORING_SETUP_NO_MMAP)) {
		io_pages_unmap(r->rings, &r->ring_pages, &r->n_ring_pages,
				true);
		io_pages_unmap(r->sq_sqes, &r->sqe_pages, &r->n_sqe_pages,
				true);
	} else {
		io_pages_free(&r->ring_pages, r->n_ring_pages);
		io_pages_free(&r->sqe_pages, r->n_sqe_pages);
		vunmap(r->rings);
		vunmap(r->sq_sqes);
	}
}

#define swap_old(ctx, o, n, field)		\
	do {					\
		(o).field = (ctx)->field;	\
		(ctx)->field = (n).field;	\
	} while (0)

#define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
#define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP)

static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
	size_t size, sq_array_offset;
	unsigned i, tail, old_head;
	struct io_uring_params p;
	void *ptr;
	int ret;

	/* for single issuer, must be owner resizing */
	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER &&
	    current != ctx->submitter_task)
		return -EEXIST;
	/* limited to DEFER_TASKRUN for now */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		return -EINVAL;
	if (copy_from_user(&p, arg, sizeof(p)))
		return -EFAULT;
	if (p.flags & ~RESIZE_FLAGS)
		return -EINVAL;

	/* properties that are always inherited */
	p.flags |= (ctx->flags & COPY_FLAGS);

	ret = io_uring_fill_params(p.sq_entries, &p);
	if (unlikely(ret))
		return ret;

	/* nothing to do, but copy params back */
	if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) {
		if (copy_to_user(arg, &p, sizeof(p)))
			return -EFAULT;
		return 0;
	}

	size = rings_size(p.flags, p.sq_entries, p.cq_entries,
				&sq_array_offset);
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	if (!(p.flags & IORING_SETUP_NO_MMAP))
		n.rings = io_pages_map(&n.ring_pages, &n.n_ring_pages, size);
	else
		n.rings = __io_uaddr_map(&n.ring_pages, &n.n_ring_pages,
						p.cq_off.user_addr, size);
	if (IS_ERR(n.rings))
		return PTR_ERR(n.rings);

	/*
	 * At this point n.rings is shared with userspace, just like o.rings
	 * is as well. While we don't expect userspace to modify it while
	 * a resize is in progress, and it's most likely that userspace will
	 * shoot itself in the foot if it does, we can't always assume good
	 * intent... Use read/write once helpers from here on to indicate the
	 * shared nature of it.
	 */
	WRITE_ONCE(n.rings->sq_ring_mask, p.sq_entries - 1);
	WRITE_ONCE(n.rings->cq_ring_mask, p.cq_entries - 1);
	WRITE_ONCE(n.rings->sq_ring_entries, p.sq_entries);
	WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries);

	if (copy_to_user(arg, &p, sizeof(p))) {
		io_register_free_rings(&p, &n);
		return -EFAULT;
	}

	if (p.flags & IORING_SETUP_SQE128)
		size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries);
	else
		size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
	if (size == SIZE_MAX) {
		io_register_free_rings(&p, &n);
		return -EOVERFLOW;
	}

	if (!(p.flags & IORING_SETUP_NO_MMAP))
		ptr = io_pages_map(&n.sqe_pages, &n.n_sqe_pages, size);
	else
		ptr = __io_uaddr_map(&n.sqe_pages, &n.n_sqe_pages,
					p.sq_off.user_addr,
					size);
	if (IS_ERR(ptr)) {
		io_register_free_rings(&p, &n);
		return PTR_ERR(ptr);
	}

	/*
	 * If using SQPOLL, park the thread
	 */
	if (ctx->sq_data) {
		mutex_unlock(&ctx->uring_lock);
		io_sq_thread_park(ctx->sq_data);
		mutex_lock(&ctx->uring_lock);
	}

	/*
	 * We'll do the swap. Grab the ctx->resize_lock, which will exclude
	 * any new mmap's on the ring fd. Clear out existing mappings to prevent
	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
	 * existing rings beyond this point will fail. Not that it could proceed
	 * at this point anyway, as the io_uring mmap side needs to grab the
	 * ctx->resize_lock as well. Likewise, hold the completion lock over the
	 * duration of the actual swap.
	 */
	mutex_lock(&ctx->resize_lock);
	spin_lock(&ctx->completion_lock);
	o.rings = ctx->rings;
	ctx->rings = NULL;
	o.sq_sqes = ctx->sq_sqes;
	ctx->sq_sqes = NULL;

	/*
	 * Now copy SQ and CQ entries, if any. If either of the destination
	 * rings can't hold what is already there, then fail the operation.
	 */
	n.sq_sqes = ptr;
	tail = READ_ONCE(o.rings->sq.tail);
	old_head = READ_ONCE(o.rings->sq.head);
	if (tail - old_head > p.sq_entries)
		goto overflow;
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->sq_entries - 1);
		unsigned dst_head = i & (p.sq_entries - 1);

		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
	}
	WRITE_ONCE(n.rings->sq.head, READ_ONCE(o.rings->sq.head));
	WRITE_ONCE(n.rings->sq.tail, READ_ONCE(o.rings->sq.tail));

	tail = READ_ONCE(o.rings->cq.tail);
	old_head = READ_ONCE(o.rings->cq.head);
	if (tail - old_head > p.cq_entries) {
overflow:
		/* restore old rings, and return -EOVERFLOW via cleanup path */
		ctx->rings = o.rings;
		ctx->sq_sqes = o.sq_sqes;
		to_free = &n;
		ret = -EOVERFLOW;
		goto out;
	}
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->cq_entries - 1);
		unsigned dst_head = i & (p.cq_entries - 1);

		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
	}
	WRITE_ONCE(n.rings->cq.head, READ_ONCE(o.rings->cq.head));
	WRITE_ONCE(n.rings->cq.tail, READ_ONCE(o.rings->cq.tail));
	/* invalidate cached cqe refill */
	ctx->cqe_cached = ctx->cqe_sentinel = NULL;

	WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped));
	WRITE_ONCE(n.rings->sq_flags, READ_ONCE(o.rings->sq_flags));
	WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags));
	WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));

	/* all done, store old pointers and assign new ones */
	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
		ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset);

	ctx->sq_entries = p.sq_entries;
	ctx->cq_entries = p.cq_entries;

	ctx->rings = n.rings;
	ctx->sq_sqes = n.sq_sqes;
	swap_old(ctx, o, n, n_ring_pages);
	swap_old(ctx, o, n, n_sqe_pages);
	swap_old(ctx, o, n, ring_pages);
	swap_old(ctx, o, n, sqe_pages);
	to_free = &o;
	ret = 0;
out:
	spin_unlock(&ctx->completion_lock);
	mutex_unlock(&ctx->resize_lock);
	io_register_free_rings(&p, to_free);

	if (ctx->sq_data)
		io_sq_thread_unpark(ctx->sq_data);

	return ret;
}

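/*
 * Usage sketch (illustrative; field and flag names taken from the io_uring
 * uapi headers, error handling elided, ring_fd and buf are placeholders):
 * IORING_REGISTER_MEM_REGION registers a memory region described by a
 * struct io_uring_region_desc. With IORING_MEM_REGION_REG_WAIT_ARG set,
 * the region carries the extended CQ wait arguments and must be registered
 * while the ring is still IORING_SETUP_R_DISABLED:
 *
 *	struct io_uring_region_desc rd = {
 *		.user_addr = (__u64)(unsigned long)buf,	// page-aligned buffer
 *		.size = 4096,
 *		.flags = IORING_MEM_REGION_TYPE_USER,
 *	};
 *	struct io_uring_mem_region_reg reg = {
 *		.region_uptr = (__u64)(unsigned long)&rd,
 *		.flags = IORING_MEM_REGION_REG_WAIT_ARG,
 *	};
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_MEM_REGION, &reg, 1);
 */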
static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
{
	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
	struct io_uring_mem_region_reg reg;
	struct io_uring_region_desc __user *rd_uptr;
	struct io_uring_region_desc rd;
	int ret;

	if (io_region_is_set(&ctx->param_region))
		return -EBUSY;
	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
		return -EFAULT;
	rd_uptr = u64_to_user_ptr(reg.region_uptr);
	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
		return -EFAULT;

	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;
	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
		return -EINVAL;

	/*
	 * This ensures there are no waiters. Waiters are unlocked and it's
	 * hard to synchronise with them, especially if we need to initialise
	 * the region.
	 */
	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
	    !(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EINVAL;

	ret = io_create_region(ctx, &ctx->param_region, &rd);
	if (ret)
		return ret;
	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
		io_free_region(ctx, &ctx->param_region);
		return -EFAULT;
	}

	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
		ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region);
		ctx->cq_wait_size = rd.size;
	}
	return 0;
}

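/*
 * Central dispatcher for io_uring_register(2) opcodes. Runs with
 * ctx->uring_lock held (individual handlers may temporarily drop and
 * reacquire it), and validates the opcode against the ring's submitter
 * task and any registered restrictions before invoking the handler.
 */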
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	case IORING_REGISTER_RESIZE_RINGS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_resize_rings(ctx, arg);
		break;
	case IORING_REGISTER_MEM_REGION:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_mem_region(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

/*
 * Given an 'fd' value, return the ctx associated with it. If 'registered' is
 * true, then the registered index is used. Otherwise, the normal fd table.
 * Caller must call fput() on the returned file, unless it's an ERR_PTR.
 */
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
	struct file *file;

	if (registered) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return ERR_PTR(-EINVAL);
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
	} else {
		file = fget(fd);
	}

	if (unlikely(!file))
		return ERR_PTR(-EBADF);
	if (io_is_uring_fops(file))
		return file;
	fput(file);
	return ERR_PTR(-EOPNOTSUPP);
}

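/*
 * Usage sketch (illustrative, error handling elided; ring_fd is a
 * placeholder): once a ring fd has been registered with
 * IORING_REGISTER_RING_FDS, the returned offset can be passed in place of
 * the real fd by or'ing IORING_REGISTER_USE_REGISTERED_RING into the
 * opcode:
 *
 *	struct io_uring_rsrc_update upd = { .data = ring_fd, .offset = -1U };
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_RING_FDS, &upd, 1);
 *	io_uring_register(upd.offset,
 *			  IORING_REGISTER_ENABLE_RINGS |
 *			  IORING_REGISTER_USE_REGISTERED_RING, NULL, 0);
 */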
/*
 * "blind" registration opcodes are ones where there's no ring given, and
 * hence the source fd must be -1.
 */
static int io_uring_register_blind(unsigned int opcode, void __user *arg,
				   unsigned int nr_args)
{
	switch (opcode) {
	case IORING_REGISTER_SEND_MSG_RING: {
		struct io_uring_sqe sqe;

		if (!arg || nr_args != 1)
			return -EINVAL;
		if (copy_from_user(&sqe, arg, sizeof(sqe)))
			return -EFAULT;
		/* no flags supported */
		if (sqe.flags)
			return -EINVAL;
		if (sqe.opcode == IORING_OP_MSG_RING)
			return io_uring_sync_msg_ring(&sqe);
		}
	}

	return -EINVAL;
}

SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (fd == -1)
		return io_uring_register_blind(opcode, arg, nr_args);

	file = io_uring_register_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);

	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
				ctx->buf_table.nr, ret);
	mutex_unlock(&ctx->uring_lock);
	if (!use_registered_ring)
		fput(file);
	return ret;
}