// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "filetable.h"
#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"
#include "msg_ring.h"
#include "memmap.h"
#include "zcrx.h"
#include "query.h"
#include "bpf_filter.h"

/* Upper bound on the number of restriction entries in one registration */
#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

/*
 * IORING_REGISTER_PROBE: fill in which opcodes this kernel supports.
 * The user buffer must arrive zeroed (it's output-only); the filled-in
 * structure is copied back to @arg.
 */
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	/* Clamp rather than fail - caller may be built for a newer kernel */
	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	size = struct_size(p, ops, nr_args);
	p = memdup_user(arg, size);
	if (IS_ERR(p))
		return PTR_ERR(p);
	ret = -EINVAL;
	/* Input must be fully zeroed, all fields are kernel-filled */
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (io_uring_op_supported(i))
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

/* Drop the credentials registered under @id, if any */
int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}


/*
 * Register the current task's credentials with the ring. Returns the
 * allocated personality id (>= 0) on success, negative error otherwise.
 */
static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

/*
 * Returns number of restrictions parsed and added on success, or < 0 for
 * an error.
 */
static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
					struct io_restriction *restrictions)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = -EINVAL;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST)
				goto err;
			__set_bit(res[i].register_op, restrictions->register_op);
			restrictions->reg_registered = true;
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST)
				goto err;
			__set_bit(res[i].sqe_op, restrictions->sqe_op);
			restrictions->op_registered = true;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			restrictions->sqe_flags_allowed = res[i].sqe_flags;
			restrictions->op_registered = true;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			restrictions->sqe_flags_required = res[i].sqe_flags;
			restrictions->op_registered = true;
			break;
		default:
			goto err;
		}
	}
	ret = nr_args;
	/*
	 * An empty set still marks both classes as registered. With no bits
	 * set in the bitmaps, that denies every op/register opcode.
	 */
	if (!nr_args) {
		restrictions->op_registered = true;
		restrictions->reg_registered = true;
	}
err:
	kfree(res);
	return ret;
}

static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	int ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.op_registered || ctx->restrictions.reg_registered)
		return -EBUSY;

	ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
	/* Reset all restrictions if an error happened */
	if (ret < 0) {
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
		return ret;
	}
	if (ctx->restrictions.op_registered)
		ctx->op_restricted = 1;
	if (ctx->restrictions.reg_registered)
		ctx->reg_restricted = 1;
	return 0;
}

/*
 * Register per-task restrictions (blind opcode, no ring fd). One-shot:
 * a task may install restrictions only once.
 */
static int io_register_restrictions_task(void __user *arg, unsigned int nr_args)
{
	struct io_uring_task_restriction __user *ures = arg;
	struct io_uring_task_restriction tres;
	struct io_restriction *res;
	int ret;

	/* Disallow if task already has registered restrictions */
	if (current->io_uring_restrict)
		return -EPERM;
	/*
	 * Similar to seccomp, disallow setting a filter if task_no_new_privs
	 * is false and we're not CAP_SYS_ADMIN.
	 */
	if (!task_no_new_privs(current) &&
	    !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
		return -EACCES;
	if (nr_args != 1)
		return -EINVAL;

	if (copy_from_user(&tres, arg, sizeof(tres)))
		return -EFAULT;

	if (tres.flags)
		return -EINVAL;
	if (!mem_is_zero(tres.resv, sizeof(tres.resv)))
		return -EINVAL;

	res = kzalloc_obj(*res, GFP_KERNEL_ACCOUNT);
	if (!res)
		return -ENOMEM;

	ret = io_parse_restrictions(ures->restrictions, tres.nr_res, res);
	if (ret < 0) {
		kfree(res);
		return ret;
	}
	current->io_uring_restrict = res;
	return 0;
}

/*
 * Attach a BPF filter to the current task (blind opcode). Reuses the
 * task's existing restriction set if one is installed, else allocates
 * a new one that is only published on success.
 */
static int io_register_bpf_filter_task(void __user *arg, unsigned int nr_args)
{
	struct io_restriction *res;
	int ret;

	/*
	 * Similar to seccomp, disallow setting a filter if task_no_new_privs
	 * is false and we're not CAP_SYS_ADMIN.
	 */
	if (!task_no_new_privs(current) &&
	    !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
		return -EACCES;

	if (nr_args != 1)
		return -EINVAL;

	/* If no task restrictions exist, setup a new set */
	res = current->io_uring_restrict;
	if (!res) {
		res = kzalloc_obj(*res, GFP_KERNEL_ACCOUNT);
		if (!res)
			return -ENOMEM;
	}

	ret = io_register_bpf_filter(res, arg);
	if (ret) {
		/* Only free what we allocated here, never a pre-existing set */
		if (res != current->io_uring_restrict)
			kfree(res);
		return ret;
	}
	if (!current->io_uring_restrict)
		current->io_uring_restrict = res;
	return 0;
}

/*
 * IORING_REGISTER_ENABLE_RINGS: flip a ring created with
 * IORING_SETUP_R_DISABLED into the enabled state.
 */
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER) {
		ctx->submitter_task = get_task_struct(current);
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	/* Keep submitter_task store before clearing IORING_SETUP_R_DISABLED */
	smp_store_release(&ctx->flags, ctx->flags & ~IORING_SETUP_R_DISABLED);
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

/* Apply @new_mask to io-wq, or to the SQPOLL thread if one is in use */
static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		/* SQPOLL path takes its own locks; drop uring_lock around it */
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

/*
 * IORING_REGISTER_IOWQ_AFF: copy a CPU mask from userspace (handling the
 * compat bitmap layout) and apply it as the io-wq worker affinity.
 */
static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

/* NULL mask resets the affinity to the default */
static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

/*
 * IORING_REGISTER_IOWQ_MAX_WORKERS: set the max bounded/unbounded io-wq
 * worker counts. The previous limits are copied back to userspace.
 */
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			struct task_struct *tsk;

			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			tsk = sqpoll_task_locked(sqd);
			if (tsk)
				tctx = tsk->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	/* Zero entries leave the corresponding limit untouched */
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	mutex_lock(&ctx->tctx_lock);
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	mutex_unlock(&ctx->tctx_lock);
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}

/*
 * IORING_REGISTER_CLOCK: select which clock the ring uses for timeouts.
 * Only CLOCK_MONOTONIC and CLOCK_BOOTTIME are supported.
 */
static int io_register_clock(struct io_ring_ctx *ctx,
			     struct io_uring_clock_register __user *arg)
{
	struct io_uring_clock_register reg;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;

	switch (reg.clockid) {
	case CLOCK_MONOTONIC:
		ctx->clock_offset = 0;
		break;
	case CLOCK_BOOTTIME:
		ctx->clock_offset = TK_OFFS_BOOT;
		break;
	default:
		return -EINVAL;
	}

	ctx->clockid = reg.clockid;
	return 0;
}

/*
 * State to maintain until we can swap. Both new and old state, used for
 * either mapping or freeing.
 */
struct io_ring_ctx_rings {
	struct io_rings *rings;
	struct io_uring_sqe *sq_sqes;

	struct io_mapped_region sq_region;
	struct io_mapped_region ring_region;
};

/* Release both ring regions held by @r (no-op for unset regions) */
static void io_register_free_rings(struct io_ring_ctx *ctx,
				   struct io_ring_ctx_rings *r)
{
	io_free_region(ctx->user, &r->sq_region);
	io_free_region(ctx->user, &r->ring_region);
}

/* Move ctx->field into the "old" state and install the "new" value */
#define swap_old(ctx, o, n, field)		\
	do {					\
		(o).field = (ctx)->field;	\
		(ctx)->field = (n).field;	\
	} while (0)

/* Setup flags the caller may pass for a resize */
#define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
/* Setup flags that are always inherited from the existing ring */
#define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \
			 IORING_SETUP_CQE_MIXED | IORING_SETUP_SQE_MIXED)

/*
 * IORING_REGISTER_RESIZE_RINGS: allocate new SQ/CQ rings of the requested
 * size, migrate any pending entries, and swap them in under the mmap and
 * completion locks. On failure the old rings are left in place.
 */
static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_ctx_config config;
	struct io_uring_region_desc rd;
	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
	unsigned i, tail, old_head;
	struct io_uring_params *p = &config.p;
	struct io_rings_layout *rl = &config.layout;
	int ret;

	memset(&config, 0, sizeof(config));

	/* limited to DEFER_TASKRUN for now */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		return -EINVAL;
	if (copy_from_user(p, arg, sizeof(*p)))
		return -EFAULT;
	if (p->flags & ~RESIZE_FLAGS)
		return -EINVAL;

	/* properties that are always inherited */
	p->flags |= (ctx->flags & COPY_FLAGS);

	ret = io_prepare_config(&config);
	if (unlikely(ret))
		return ret;

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(rl->rings_size);
	if (p->flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p->cq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
	if (ret)
		return ret;

	n.rings = io_region_get_ptr(&n.ring_region);

	/*
	 * At this point n.rings is shared with userspace, just like o.rings
	 * is as well. While we don't expect userspace to modify it while
	 * a resize is in progress, and it's most likely that userspace will
	 * shoot itself in the foot if it does, we can't always assume good
	 * intent... Use read/write once helpers from here on to indicate the
	 * shared nature of it.
	 */
	WRITE_ONCE(n.rings->sq_ring_mask, p->sq_entries - 1);
	WRITE_ONCE(n.rings->cq_ring_mask, p->cq_entries - 1);
	WRITE_ONCE(n.rings->sq_ring_entries, p->sq_entries);
	WRITE_ONCE(n.rings->cq_ring_entries, p->cq_entries);

	if (copy_to_user(arg, p, sizeof(*p))) {
		io_register_free_rings(ctx, &n);
		return -EFAULT;
	}

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(rl->sq_size);
	if (p->flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p->sq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
	if (ret) {
		io_register_free_rings(ctx, &n);
		return ret;
	}
	n.sq_sqes = io_region_get_ptr(&n.sq_region);

	/*
	 * If using SQPOLL, park the thread
	 */
	if (ctx->sq_data) {
		mutex_unlock(&ctx->uring_lock);
		io_sq_thread_park(ctx->sq_data);
		mutex_lock(&ctx->uring_lock);
	}

	/*
	 * We'll do the swap. Grab the ctx->mmap_lock, which will exclude
	 * any new mmap's on the ring fd. Clear out existing mappings to prevent
	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
	 * existing rings beyond this point will fail. Not that it could proceed
	 * at this point anyway, as the io_uring mmap side needs go grab the
	 * ctx->mmap_lock as well. Likewise, hold the completion lock over the
	 * duration of the actual swap.
	 */
	mutex_lock(&ctx->mmap_lock);
	spin_lock(&ctx->completion_lock);
	o.rings = ctx->rings;
	ctx->rings = NULL;
	o.sq_sqes = ctx->sq_sqes;
	ctx->sq_sqes = NULL;

	/*
	 * Now copy SQ and CQ entries, if any. If either of the destination
	 * rings can't hold what is already there, then fail the operation.
	 */
	tail = READ_ONCE(o.rings->sq.tail);
	old_head = READ_ONCE(o.rings->sq.head);
	if (tail - old_head > p->sq_entries)
		goto overflow;
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->sq_entries - 1);
		unsigned dst_head = i & (p->sq_entries - 1);

		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
	}
	WRITE_ONCE(n.rings->sq.head, old_head);
	WRITE_ONCE(n.rings->sq.tail, tail);

	tail = READ_ONCE(o.rings->cq.tail);
	old_head = READ_ONCE(o.rings->cq.head);
	if (tail - old_head > p->cq_entries) {
overflow:
		/* restore old rings, and return -EOVERFLOW via cleanup path */
		ctx->rings = o.rings;
		ctx->sq_sqes = o.sq_sqes;
		to_free = &n;
		ret = -EOVERFLOW;
		goto out;
	}
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->cq_entries - 1);
		unsigned dst_head = i & (p->cq_entries - 1);

		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
	}
	WRITE_ONCE(n.rings->cq.head, old_head);
	WRITE_ONCE(n.rings->cq.tail, tail);
	/* invalidate cached cqe refill */
	ctx->cqe_cached = ctx->cqe_sentinel = NULL;

	WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped));
	atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags));
	WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags));
	WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));

	/* all done, store old pointers and assign new ones */
	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
		ctx->sq_array = (u32 *)((char *)n.rings + rl->sq_array_offset);

	ctx->sq_entries = p->sq_entries;
	ctx->cq_entries = p->cq_entries;

	/*
	 * Just mark any flag we may have missed and that the application
	 * should act on unconditionally. Worst case it'll be an extra
	 * syscall.
	 */
	atomic_or(IORING_SQ_TASKRUN | IORING_SQ_NEED_WAKEUP, &n.rings->sq_flags);
	ctx->rings = n.rings;
	rcu_assign_pointer(ctx->rings_rcu, n.rings);

	ctx->sq_sqes = n.sq_sqes;
	swap_old(ctx, o, n, ring_region);
	swap_old(ctx, o, n, sq_region);
	to_free = &o;
	ret = 0;
out:
	spin_unlock(&ctx->completion_lock);
	mutex_unlock(&ctx->mmap_lock);
	/* Wait for concurrent io_ctx_mark_taskrun() */
	if (to_free == &o)
		synchronize_rcu_expedited();
	io_register_free_rings(ctx, to_free);

	if (ctx->sq_data)
		io_sq_thread_unpark(ctx->sq_data);

	return ret;
}

/*
 * IORING_REGISTER_MEM_REGION: map a user-described memory region into the
 * ring, optionally used as the CQ wait argument area. Only one parameter
 * region may be registered per ring.
 */
static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
{
	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
	struct io_uring_mem_region_reg reg;
	struct io_uring_region_desc __user *rd_uptr;
	struct io_uring_region_desc rd;
	struct io_mapped_region region = {};
	int ret;

	if (io_region_is_set(&ctx->param_region))
		return -EBUSY;
	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
		return -EFAULT;
	rd_uptr = u64_to_user_ptr(reg.region_uptr);
	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;
	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
		return -EINVAL;

	/*
	 * This ensures there are no waiters. Waiters are unlocked and it's
	 * hard to synchronise with them, especially if we need to initialise
	 * the region.
	 */
	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
	    !(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EINVAL;

	ret = io_create_region(ctx, &region, &rd, IORING_MAP_OFF_PARAM_REGION);
	if (ret)
		return ret;
	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
		io_free_region(ctx->user, &region);
		return -EFAULT;
	}

	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
		ctx->cq_wait_arg = io_region_get_ptr(&region);
		ctx->cq_wait_size = rd.size;
	}

	io_region_publish(ctx, &region, &ctx->param_region);
	return 0;
}

/*
 * Dispatch a register opcode against @ctx. Called with uring_lock held;
 * per-opcode arg/nr_args validation happens in each case.
 */
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	/* Enforce registered opcode restrictions once the ring is enabled */
	if (ctx->reg_restricted && !(ctx->flags & IORING_SETUP_R_DISABLED)) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	case IORING_REGISTER_ZCRX_IFQ:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_zcrx_ifq(ctx, arg);
		break;
	case IORING_REGISTER_RESIZE_RINGS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_resize_rings(ctx, arg);
		break;
	case IORING_REGISTER_MEM_REGION:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_mem_region(ctx, arg);
		break;
	case IORING_REGISTER_QUERY:
		ret = io_query(arg, nr_args);
		break;
	case IORING_REGISTER_ZCRX_CTRL:
		ret = io_zcrx_ctrl(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_BPF_FILTER:
		ret = -EINVAL;

		if (nr_args != 1)
			break;
		ret = io_register_bpf_filter(&ctx->restrictions, arg);
		if (!ret)
			WRITE_ONCE(ctx->bpf_filters,
				   ctx->restrictions.bpf_filters->filters);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

/*
 * Given an 'fd' value, return the ctx associated with if. If 'registered' is
 * true, then the registered index is used. Otherwise, the normal fd table.
 * Caller must call fput() on the returned file, unless it's an ERR_PTR.
 */
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
	struct file *file;

	if (registered) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return ERR_PTR(-EINVAL);
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (file)
			get_file(file);
	} else {
		file = fget(fd);
	}

	if (unlikely(!file))
		return ERR_PTR(-EBADF);
	if (io_is_uring_fops(file))
		return file;
	fput(file);
	return ERR_PTR(-EOPNOTSUPP);
}

/*
 * IORING_REGISTER_SEND_MSG_RING (blind): validate a single MSG_RING sqe
 * from userspace and execute it synchronously, no ring required.
 */
static int io_uring_register_send_msg_ring(void __user *arg, unsigned int nr_args)
{
	struct io_uring_sqe sqe;

	if (!arg || nr_args != 1)
		return -EINVAL;
	if (copy_from_user(&sqe, arg, sizeof(sqe)))
		return -EFAULT;
	/* no flags supported */
	if (sqe.flags)
		return -EINVAL;
	if (sqe.opcode != IORING_OP_MSG_RING)
		return -EINVAL;

	return io_uring_sync_msg_ring(&sqe);
}

/*
 * "blind" registration opcodes are ones where there's no ring given, and
 * hence the source fd must be -1.
 */
static int io_uring_register_blind(unsigned int opcode, void __user *arg,
				   unsigned int nr_args)
{
	switch (opcode) {
	case IORING_REGISTER_SEND_MSG_RING:
		return io_uring_register_send_msg_ring(arg, nr_args);
	case IORING_REGISTER_QUERY:
		return io_query(arg, nr_args);
	case IORING_REGISTER_RESTRICTIONS:
		return io_register_restrictions_task(arg, nr_args);
	case IORING_REGISTER_BPF_FILTER:
		return io_register_bpf_filter_task(arg, nr_args);
	}
	return -EINVAL;
}

SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	/* fd == -1 selects the ring-less "blind" opcodes */
	if (fd == -1)
		return io_uring_register_blind(opcode, arg, nr_args);

	file = io_uring_register_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);

	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
				ctx->buf_table.nr, ret);
	mutex_unlock(&ctx->uring_lock);

	fput(file);
	return ret;
}