1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Code related to the io_uring_register() syscall 4 * 5 * Copyright (C) 2023 Jens Axboe 6 */ 7 #include <linux/kernel.h> 8 #include <linux/errno.h> 9 #include <linux/syscalls.h> 10 #include <linux/refcount.h> 11 #include <linux/bits.h> 12 #include <linux/fs.h> 13 #include <linux/file.h> 14 #include <linux/slab.h> 15 #include <linux/uaccess.h> 16 #include <linux/nospec.h> 17 #include <linux/compat.h> 18 #include <linux/io_uring.h> 19 #include <linux/io_uring_types.h> 20 21 #include "filetable.h" 22 #include "io_uring.h" 23 #include "opdef.h" 24 #include "tctx.h" 25 #include "rsrc.h" 26 #include "sqpoll.h" 27 #include "register.h" 28 #include "cancel.h" 29 #include "kbuf.h" 30 #include "napi.h" 31 #include "eventfd.h" 32 #include "msg_ring.h" 33 #include "memmap.h" 34 #include "zcrx.h" 35 #include "query.h" 36 #include "bpf_filter.h" 37 38 #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ 39 IORING_REGISTER_LAST + IORING_OP_LAST) 40 41 static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, 42 unsigned nr_args) 43 { 44 struct io_uring_probe *p; 45 size_t size; 46 int i, ret; 47 48 if (nr_args > IORING_OP_LAST) 49 nr_args = IORING_OP_LAST; 50 51 size = struct_size(p, ops, nr_args); 52 p = memdup_user(arg, size); 53 if (IS_ERR(p)) 54 return PTR_ERR(p); 55 ret = -EINVAL; 56 if (memchr_inv(p, 0, size)) 57 goto out; 58 59 p->last_op = IORING_OP_LAST - 1; 60 61 for (i = 0; i < nr_args; i++) { 62 p->ops[i].op = i; 63 if (io_uring_op_supported(i)) 64 p->ops[i].flags = IO_URING_OP_SUPPORTED; 65 } 66 p->ops_len = i; 67 68 ret = 0; 69 if (copy_to_user(arg, p, size)) 70 ret = -EFAULT; 71 out: 72 kfree(p); 73 return ret; 74 } 75 76 int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) 77 { 78 const struct cred *creds; 79 80 creds = xa_erase(&ctx->personalities, id); 81 if (creds) { 82 put_cred(creds); 83 return 0; 84 } 85 86 return -EINVAL; 87 } 88 89 90 static int io_register_personality(struct io_ring_ctx *ctx) 91 { 92 const struct cred *creds; 93 u32 id; 94 int ret; 95 96 creds = get_current_cred(); 97 98 ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds, 99 XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL); 100 if (ret < 0) { 101 put_cred(creds); 102 return ret; 103 } 104 return id; 105 } 106 107 /* 108 * Returns number of restrictions parsed and added on success, or < 0 for 109 * an error. 110 */ 111 static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args, 112 struct io_restriction *restrictions) 113 { 114 struct io_uring_restriction *res; 115 size_t size; 116 int i, ret; 117 118 if (!arg || nr_args > IORING_MAX_RESTRICTIONS) 119 return -EINVAL; 120 121 size = array_size(nr_args, sizeof(*res)); 122 if (size == SIZE_MAX) 123 return -EOVERFLOW; 124 125 res = memdup_user(arg, size); 126 if (IS_ERR(res)) 127 return PTR_ERR(res); 128 129 ret = -EINVAL; 130 131 for (i = 0; i < nr_args; i++) { 132 switch (res[i].opcode) { 133 case IORING_RESTRICTION_REGISTER_OP: 134 if (res[i].register_op >= IORING_REGISTER_LAST) 135 goto err; 136 __set_bit(res[i].register_op, restrictions->register_op); 137 restrictions->reg_registered = true; 138 break; 139 case IORING_RESTRICTION_SQE_OP: 140 if (res[i].sqe_op >= IORING_OP_LAST) 141 goto err; 142 __set_bit(res[i].sqe_op, restrictions->sqe_op); 143 restrictions->op_registered = true; 144 break; 145 case IORING_RESTRICTION_SQE_FLAGS_ALLOWED: 146 restrictions->sqe_flags_allowed = res[i].sqe_flags; 147 restrictions->op_registered = true; 148 break; 149 case IORING_RESTRICTION_SQE_FLAGS_REQUIRED: 150 restrictions->sqe_flags_required = res[i].sqe_flags; 151 restrictions->op_registered = true; 152 break; 153 default: 154 goto err; 155 } 156 } 157 ret = nr_args; 158 if (!nr_args) { 159 restrictions->op_registered = true; 160 restrictions->reg_registered = true; 161 } 162 err: 163 kfree(res); 164 return ret; 165 } 166 167 static __cold int io_register_restrictions(struct io_ring_ctx *ctx, 168 void __user *arg, unsigned int nr_args) 169 { 170 int ret; 171 172 /* Restrictions allowed only if rings started disabled */ 173 if (!(ctx->flags & IORING_SETUP_R_DISABLED)) 174 return -EBADFD; 175 176 /* We allow only a single restrictions registration */ 177 if (ctx->restrictions.op_registered || ctx->restrictions.reg_registered) 178 return -EBUSY; 179 180 ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions); 181 /* 182 * Reset all restrictions if an error happened, but retain any COW'ed 183 * settings. 184 */ 185 if (ret < 0) { 186 struct io_bpf_filters *bpf = ctx->restrictions.bpf_filters; 187 bool cowed = ctx->restrictions.bpf_filters_cow; 188 189 memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); 190 ctx->restrictions.bpf_filters = bpf; 191 ctx->restrictions.bpf_filters_cow = cowed; 192 return ret; 193 } 194 if (ctx->restrictions.op_registered) 195 ctx->int_flags |= IO_RING_F_OP_RESTRICTED; 196 if (ctx->restrictions.reg_registered) 197 ctx->int_flags |= IO_RING_F_REG_RESTRICTED; 198 return 0; 199 } 200 201 static int io_register_restrictions_task(void __user *arg, unsigned int nr_args) 202 { 203 struct io_uring_task_restriction __user *ures = arg; 204 struct io_uring_task_restriction tres; 205 struct io_restriction *res; 206 int ret; 207 208 /* Disallow if task already has registered restrictions */ 209 if (current->io_uring_restrict) 210 return -EPERM; 211 /* 212 * Similar to seccomp, disallow setting a filter if task_no_new_privs 213 * is false and we're not CAP_SYS_ADMIN. 214 */ 215 if (!task_no_new_privs(current) && 216 !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN)) 217 return -EACCES; 218 if (nr_args != 1) 219 return -EINVAL; 220 221 if (copy_from_user(&tres, arg, sizeof(tres))) 222 return -EFAULT; 223 224 if (tres.flags) 225 return -EINVAL; 226 if (!mem_is_zero(tres.resv, sizeof(tres.resv))) 227 return -EINVAL; 228 229 res = kzalloc_obj(*res, GFP_KERNEL_ACCOUNT); 230 if (!res) 231 return -ENOMEM; 232 233 ret = io_parse_restrictions(ures->restrictions, tres.nr_res, res); 234 if (ret < 0) { 235 kfree(res); 236 return ret; 237 } 238 current->io_uring_restrict = res; 239 return 0; 240 } 241 242 static int io_register_bpf_filter_task(void __user *arg, unsigned int nr_args) 243 { 244 struct io_restriction *res; 245 int ret; 246 247 /* 248 * Similar to seccomp, disallow setting a filter if task_no_new_privs 249 * is false and we're not CAP_SYS_ADMIN. 250 */ 251 if (!task_no_new_privs(current) && 252 !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN)) 253 return -EACCES; 254 255 if (nr_args != 1) 256 return -EINVAL; 257 258 /* If no task restrictions exist, setup a new set */ 259 res = current->io_uring_restrict; 260 if (!res) { 261 res = kzalloc_obj(*res, GFP_KERNEL_ACCOUNT); 262 if (!res) 263 return -ENOMEM; 264 } 265 266 ret = io_register_bpf_filter(res, arg); 267 if (ret) { 268 if (res != current->io_uring_restrict) 269 kfree(res); 270 return ret; 271 } 272 if (!current->io_uring_restrict) 273 current->io_uring_restrict = res; 274 return 0; 275 } 276 277 static int io_register_enable_rings(struct io_ring_ctx *ctx) 278 { 279 if (!(ctx->flags & IORING_SETUP_R_DISABLED)) 280 return -EBADFD; 281 282 if (ctx->flags & IORING_SETUP_SINGLE_ISSUER) { 283 ctx->submitter_task = get_task_struct(current); 284 /* 285 * Lazy activation attempts would fail if it was polled before 286 * submitter_task is set. 287 */ 288 if (wq_has_sleeper(&ctx->poll_wq)) 289 io_activate_pollwq(ctx); 290 } 291 292 /* Keep submitter_task store before clearing IORING_SETUP_R_DISABLED */ 293 smp_store_release(&ctx->flags, ctx->flags & ~IORING_SETUP_R_DISABLED); 294 if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait)) 295 wake_up(&ctx->sq_data->wait); 296 return 0; 297 } 298 299 static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx, 300 cpumask_var_t new_mask) 301 { 302 int ret; 303 304 if (!(ctx->flags & IORING_SETUP_SQPOLL)) { 305 ret = io_wq_cpu_affinity(current->io_uring, new_mask); 306 } else { 307 mutex_unlock(&ctx->uring_lock); 308 ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask); 309 mutex_lock(&ctx->uring_lock); 310 } 311 312 return ret; 313 } 314 315 static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, 316 void __user *arg, unsigned len) 317 { 318 cpumask_var_t new_mask; 319 int ret; 320 321 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 322 return -ENOMEM; 323 324 cpumask_clear(new_mask); 325 if (len > cpumask_size()) 326 len = cpumask_size(); 327 328 #ifdef CONFIG_COMPAT 329 if (in_compat_syscall()) 330 ret = compat_get_bitmap(cpumask_bits(new_mask), 331 (const compat_ulong_t __user *)arg, 332 len * 8 /* CHAR_BIT */); 333 else 334 #endif 335 ret = copy_from_user(new_mask, arg, len); 336 337 if (ret) { 338 free_cpumask_var(new_mask); 339 return -EFAULT; 340 } 341 342 ret = __io_register_iowq_aff(ctx, new_mask); 343 free_cpumask_var(new_mask); 344 return ret; 345 } 346 347 static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx) 348 { 349 return __io_register_iowq_aff(ctx, NULL); 350 } 351 352 static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, 353 void __user *arg) 354 __must_hold(&ctx->uring_lock) 355 { 356 struct io_tctx_node *node; 357 struct io_uring_task *tctx = NULL; 358 struct io_sq_data *sqd = NULL; 359 __u32 new_count[2]; 360 int i, ret; 361 362 if (copy_from_user(new_count, arg, sizeof(new_count))) 363 return -EFAULT; 364 for (i = 0; i < ARRAY_SIZE(new_count); i++) 365 if (new_count[i] > INT_MAX) 366 return -EINVAL; 367 368 if (ctx->flags & IORING_SETUP_SQPOLL) { 369 sqd = ctx->sq_data; 370 if (sqd) { 371 struct task_struct *tsk; 372 373 /* 374 * Observe the correct sqd->lock -> ctx->uring_lock 375 * ordering. Fine to drop uring_lock here, we hold 376 * a ref to the ctx. 377 */ 378 refcount_inc(&sqd->refs); 379 mutex_unlock(&ctx->uring_lock); 380 mutex_lock(&sqd->lock); 381 mutex_lock(&ctx->uring_lock); 382 tsk = sqpoll_task_locked(sqd); 383 if (tsk) 384 tctx = tsk->io_uring; 385 } 386 } else { 387 tctx = current->io_uring; 388 } 389 390 BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits)); 391 392 for (i = 0; i < ARRAY_SIZE(new_count); i++) 393 if (new_count[i]) 394 ctx->iowq_limits[i] = new_count[i]; 395 ctx->int_flags |= IO_RING_F_IOWQ_LIMITS_SET; 396 397 if (tctx && tctx->io_wq) { 398 ret = io_wq_max_workers(tctx->io_wq, new_count); 399 if (ret) 400 goto err; 401 } else { 402 memset(new_count, 0, sizeof(new_count)); 403 } 404 405 if (sqd) { 406 mutex_unlock(&ctx->uring_lock); 407 mutex_unlock(&sqd->lock); 408 io_put_sq_data(sqd); 409 mutex_lock(&ctx->uring_lock); 410 } 411 412 if (copy_to_user(arg, new_count, sizeof(new_count))) 413 return -EFAULT; 414 415 /* that's it for SQPOLL, only the SQPOLL task creates requests */ 416 if (sqd) 417 return 0; 418 419 /* now propagate the restriction to all registered users */ 420 mutex_lock(&ctx->tctx_lock); 421 list_for_each_entry(node, &ctx->tctx_list, ctx_node) { 422 tctx = node->task->io_uring; 423 if (WARN_ON_ONCE(!tctx->io_wq)) 424 continue; 425 426 for (i = 0; i < ARRAY_SIZE(new_count); i++) 427 new_count[i] = ctx->iowq_limits[i]; 428 /* ignore errors, it always returns zero anyway */ 429 (void)io_wq_max_workers(tctx->io_wq, new_count); 430 } 431 mutex_unlock(&ctx->tctx_lock); 432 return 0; 433 err: 434 if (sqd) { 435 mutex_unlock(&ctx->uring_lock); 436 mutex_unlock(&sqd->lock); 437 io_put_sq_data(sqd); 438 mutex_lock(&ctx->uring_lock); 439 } 440 return ret; 441 } 442 443 static int io_register_clock(struct io_ring_ctx *ctx, 444 struct io_uring_clock_register __user *arg) 445 { 446 struct io_uring_clock_register reg; 447 448 if (copy_from_user(®, arg, sizeof(reg))) 449 return -EFAULT; 450 if (memchr_inv(®.__resv, 0, sizeof(reg.__resv))) 451 return -EINVAL; 452 453 switch (reg.clockid) { 454 case CLOCK_MONOTONIC: 455 ctx->clock_offset = 0; 456 break; 457 case CLOCK_BOOTTIME: 458 ctx->clock_offset = TK_OFFS_BOOT; 459 break; 460 default: 461 return -EINVAL; 462 } 463 464 ctx->clockid = reg.clockid; 465 return 0; 466 } 467 468 /* 469 * State to maintain until we can swap. Both new and old state, used for 470 * either mapping or freeing. 471 */ 472 struct io_ring_ctx_rings { 473 struct io_rings *rings; 474 struct io_uring_sqe *sq_sqes; 475 476 struct io_mapped_region sq_region; 477 struct io_mapped_region ring_region; 478 }; 479 480 static void io_register_free_rings(struct io_ring_ctx *ctx, 481 struct io_ring_ctx_rings *r) 482 { 483 io_free_region(ctx->user, &r->sq_region); 484 io_free_region(ctx->user, &r->ring_region); 485 } 486 487 #define swap_old(ctx, o, n, field) \ 488 do { \ 489 (o).field = (ctx)->field; \ 490 (ctx)->field = (n).field; \ 491 } while (0) 492 493 #define RESIZE_FLAGS (IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP) 494 #define COPY_FLAGS (IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \ 495 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \ 496 IORING_SETUP_CQE_MIXED | IORING_SETUP_SQE_MIXED) 497 498 static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) 499 { 500 struct io_ctx_config config; 501 struct io_uring_region_desc rd; 502 struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL; 503 unsigned i, tail, old_head; 504 struct io_uring_params *p = &config.p; 505 struct io_rings_layout *rl = &config.layout; 506 u32 *o_sq_array, *n_sq_array = NULL; 507 int ret; 508 509 memset(&config, 0, sizeof(config)); 510 511 /* limited to DEFER_TASKRUN for now */ 512 if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) 513 return -EINVAL; 514 if (copy_from_user(p, arg, sizeof(*p))) 515 return -EFAULT; 516 if (p->flags & ~RESIZE_FLAGS) 517 return -EINVAL; 518 519 /* properties that are always inherited */ 520 p->flags |= (ctx->flags & COPY_FLAGS); 521 522 ret = io_prepare_config(&config); 523 if (unlikely(ret)) 524 return ret; 525 526 memset(&rd, 0, sizeof(rd)); 527 rd.size = PAGE_ALIGN(rl->rings_size); 528 if (p->flags & IORING_SETUP_NO_MMAP) { 529 rd.user_addr = p->cq_off.user_addr; 530 rd.flags |= IORING_MEM_REGION_TYPE_USER; 531 } 532 ret = io_create_region(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING); 533 if (ret) 534 return ret; 535 536 n.rings = io_region_get_ptr(&n.ring_region); 537 538 /* 539 * At this point n.rings is shared with userspace, just like o.rings 540 * is as well. While we don't expect userspace to modify it while 541 * a resize is in progress, and it's most likely that userspace will 542 * shoot itself in the foot if it does, we can't always assume good 543 * intent... Use read/write once helpers from here on to indicate the 544 * shared nature of it. 545 */ 546 WRITE_ONCE(n.rings->sq_ring_mask, p->sq_entries - 1); 547 WRITE_ONCE(n.rings->cq_ring_mask, p->cq_entries - 1); 548 WRITE_ONCE(n.rings->sq_ring_entries, p->sq_entries); 549 WRITE_ONCE(n.rings->cq_ring_entries, p->cq_entries); 550 551 if (copy_to_user(arg, p, sizeof(*p))) { 552 io_register_free_rings(ctx, &n); 553 return -EFAULT; 554 } 555 556 memset(&rd, 0, sizeof(rd)); 557 rd.size = PAGE_ALIGN(rl->sq_size); 558 if (p->flags & IORING_SETUP_NO_MMAP) { 559 rd.user_addr = p->sq_off.user_addr; 560 rd.flags |= IORING_MEM_REGION_TYPE_USER; 561 } 562 ret = io_create_region(ctx, &n.sq_region, &rd, IORING_OFF_SQES); 563 if (ret) { 564 io_register_free_rings(ctx, &n); 565 return ret; 566 } 567 n.sq_sqes = io_region_get_ptr(&n.sq_region); 568 569 /* 570 * If using SQPOLL, park the thread 571 */ 572 if (ctx->sq_data) { 573 mutex_unlock(&ctx->uring_lock); 574 io_sq_thread_park(ctx->sq_data); 575 mutex_lock(&ctx->uring_lock); 576 } 577 578 /* 579 * We'll do the swap. Grab the ctx->mmap_lock, which will exclude 580 * any new mmap's on the ring fd. Clear out existing mappings to prevent 581 * mmap from seeing them, as we'll unmap them. Any attempt to mmap 582 * existing rings beyond this point will fail. Not that it could proceed 583 * at this point anyway, as the io_uring mmap side needs go grab the 584 * ctx->mmap_lock as well. Likewise, hold the completion lock over the 585 * duration of the actual swap. 586 */ 587 mutex_lock(&ctx->mmap_lock); 588 spin_lock(&ctx->completion_lock); 589 o.rings = ctx->rings; 590 ctx->rings = NULL; 591 o.sq_sqes = ctx->sq_sqes; 592 ctx->sq_sqes = NULL; 593 o_sq_array = ctx->sq_array; 594 if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) 595 n_sq_array = (u32 *)((char *)n.rings + rl->sq_array_offset); 596 597 /* 598 * Now copy SQ and CQ entries, if any. If either of the destination 599 * rings can't hold what is already there, then fail the operation. 600 */ 601 tail = READ_ONCE(o.rings->sq.tail); 602 old_head = READ_ONCE(o.rings->sq.head); 603 if (tail - old_head > p->sq_entries) 604 goto overflow; 605 for (i = old_head; i < tail; i++) { 606 unsigned int dst, src; 607 size_t sq_size; 608 609 dst = i & (p->sq_entries - 1); 610 src = i & (ctx->sq_entries - 1); 611 if (n_sq_array) { 612 src = READ_ONCE(o_sq_array[src]); 613 if (unlikely(src >= ctx->sq_entries)) { 614 WRITE_ONCE(n_sq_array[dst], UINT_MAX); 615 continue; 616 } 617 WRITE_ONCE(n_sq_array[dst], dst); 618 } 619 620 sq_size = sizeof(struct io_uring_sqe); 621 if (ctx->flags & IORING_SETUP_SQE128) { 622 dst <<= 1; 623 src <<= 1; 624 sq_size <<= 1; 625 } 626 memcpy(&n.sq_sqes[dst], &o.sq_sqes[src], sq_size); 627 } 628 WRITE_ONCE(n.rings->sq.head, old_head); 629 WRITE_ONCE(n.rings->sq.tail, tail); 630 631 tail = READ_ONCE(o.rings->cq.tail); 632 old_head = READ_ONCE(o.rings->cq.head); 633 if (tail - old_head > p->cq_entries) { 634 overflow: 635 /* restore old rings, and return -EOVERFLOW via cleanup path */ 636 ctx->rings = o.rings; 637 ctx->sq_sqes = o.sq_sqes; 638 to_free = &n; 639 ret = -EOVERFLOW; 640 goto out; 641 } 642 for (i = old_head; i < tail; i++) { 643 unsigned index, dst_mask, src_mask; 644 size_t cq_size; 645 646 index = i; 647 cq_size = sizeof(struct io_uring_cqe); 648 src_mask = ctx->cq_entries - 1; 649 dst_mask = p->cq_entries - 1; 650 if (ctx->flags & IORING_SETUP_CQE32) { 651 index <<= 1; 652 cq_size <<= 1; 653 src_mask = (ctx->cq_entries << 1) - 1; 654 dst_mask = (p->cq_entries << 1) - 1; 655 } 656 memcpy(&n.rings->cqes[index & dst_mask], &o.rings->cqes[index & src_mask], cq_size); 657 } 658 WRITE_ONCE(n.rings->cq.head, old_head); 659 WRITE_ONCE(n.rings->cq.tail, tail); 660 /* invalidate cached cqe refill */ 661 ctx->cqe_cached = ctx->cqe_sentinel = NULL; 662 663 WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped)); 664 atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags)); 665 WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags)); 666 WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow)); 667 668 /* all done, store old pointers and assign new ones */ 669 if (n_sq_array) 670 ctx->sq_array = n_sq_array; 671 672 ctx->sq_entries = p->sq_entries; 673 ctx->cq_entries = p->cq_entries; 674 675 /* 676 * Just mark any flag we may have missed and that the application 677 * should act on unconditionally. Worst case it'll be an extra 678 * syscall. 679 */ 680 atomic_or(IORING_SQ_TASKRUN | IORING_SQ_NEED_WAKEUP, &n.rings->sq_flags); 681 ctx->rings = n.rings; 682 rcu_assign_pointer(ctx->rings_rcu, n.rings); 683 684 ctx->sq_sqes = n.sq_sqes; 685 swap_old(ctx, o, n, ring_region); 686 swap_old(ctx, o, n, sq_region); 687 to_free = &o; 688 ret = 0; 689 out: 690 spin_unlock(&ctx->completion_lock); 691 mutex_unlock(&ctx->mmap_lock); 692 /* Wait for concurrent io_ctx_mark_taskrun() */ 693 if (to_free == &o) 694 synchronize_rcu_expedited(); 695 io_register_free_rings(ctx, to_free); 696 697 if (ctx->sq_data) 698 io_sq_thread_unpark(ctx->sq_data); 699 700 return ret; 701 } 702 703 static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg) 704 { 705 struct io_uring_mem_region_reg __user *reg_uptr = uarg; 706 struct io_uring_mem_region_reg reg; 707 struct io_uring_region_desc __user *rd_uptr; 708 struct io_uring_region_desc rd; 709 struct io_mapped_region region = {}; 710 int ret; 711 712 if (io_region_is_set(&ctx->param_region)) 713 return -EBUSY; 714 if (copy_from_user(®, reg_uptr, sizeof(reg))) 715 return -EFAULT; 716 rd_uptr = u64_to_user_ptr(reg.region_uptr); 717 if (copy_from_user(&rd, rd_uptr, sizeof(rd))) 718 return -EFAULT; 719 if (memchr_inv(®.__resv, 0, sizeof(reg.__resv))) 720 return -EINVAL; 721 if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG) 722 return -EINVAL; 723 724 /* 725 * This ensures there are no waiters. Waiters are unlocked and it's 726 * hard to synchronise with them, especially if we need to initialise 727 * the region. 728 */ 729 if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) && 730 !(ctx->flags & IORING_SETUP_R_DISABLED)) 731 return -EINVAL; 732 733 ret = io_create_region(ctx, ®ion, &rd, IORING_MAP_OFF_PARAM_REGION); 734 if (ret) 735 return ret; 736 if (copy_to_user(rd_uptr, &rd, sizeof(rd))) { 737 io_free_region(ctx->user, ®ion); 738 return -EFAULT; 739 } 740 741 if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) { 742 ctx->cq_wait_arg = io_region_get_ptr(®ion); 743 ctx->cq_wait_size = rd.size; 744 } 745 746 io_region_publish(ctx, ®ion, &ctx->param_region); 747 return 0; 748 } 749 750 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, 751 void __user *arg, unsigned nr_args) 752 __releases(ctx->uring_lock) 753 __acquires(ctx->uring_lock) 754 { 755 int ret; 756 757 /* 758 * We don't quiesce the refs for register anymore and so it can't be 759 * dying as we're holding a file ref here. 760 */ 761 if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs))) 762 return -ENXIO; 763 764 if (ctx->submitter_task && ctx->submitter_task != current) 765 return -EEXIST; 766 767 if ((ctx->int_flags & IO_RING_F_REG_RESTRICTED) && !(ctx->flags & IORING_SETUP_R_DISABLED)) { 768 opcode = array_index_nospec(opcode, IORING_REGISTER_LAST); 769 if (!test_bit(opcode, ctx->restrictions.register_op)) 770 return -EACCES; 771 } 772 773 switch (opcode) { 774 case IORING_REGISTER_BUFFERS: 775 ret = -EFAULT; 776 if (!arg) 777 break; 778 ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL); 779 break; 780 case IORING_UNREGISTER_BUFFERS: 781 ret = -EINVAL; 782 if (arg || nr_args) 783 break; 784 ret = io_sqe_buffers_unregister(ctx); 785 break; 786 case IORING_REGISTER_FILES: 787 ret = -EFAULT; 788 if (!arg) 789 break; 790 ret = io_sqe_files_register(ctx, arg, nr_args, NULL); 791 break; 792 case IORING_UNREGISTER_FILES: 793 ret = -EINVAL; 794 if (arg || nr_args) 795 break; 796 ret = io_sqe_files_unregister(ctx); 797 break; 798 case IORING_REGISTER_FILES_UPDATE: 799 ret = io_register_files_update(ctx, arg, nr_args); 800 break; 801 case IORING_REGISTER_EVENTFD: 802 ret = -EINVAL; 803 if (nr_args != 1) 804 break; 805 ret = io_eventfd_register(ctx, arg, 0); 806 break; 807 case IORING_REGISTER_EVENTFD_ASYNC: 808 ret = -EINVAL; 809 if (nr_args != 1) 810 break; 811 ret = io_eventfd_register(ctx, arg, 1); 812 break; 813 case IORING_UNREGISTER_EVENTFD: 814 ret = -EINVAL; 815 if (arg || nr_args) 816 break; 817 ret = io_eventfd_unregister(ctx); 818 break; 819 case IORING_REGISTER_PROBE: 820 ret = -EINVAL; 821 if (!arg || nr_args > 256) 822 break; 823 ret = io_probe(ctx, arg, nr_args); 824 break; 825 case IORING_REGISTER_PERSONALITY: 826 ret = -EINVAL; 827 if (arg || nr_args) 828 break; 829 ret = io_register_personality(ctx); 830 break; 831 case IORING_UNREGISTER_PERSONALITY: 832 ret = -EINVAL; 833 if (arg) 834 break; 835 ret = io_unregister_personality(ctx, nr_args); 836 break; 837 case IORING_REGISTER_ENABLE_RINGS: 838 ret = -EINVAL; 839 if (arg || nr_args) 840 break; 841 ret = io_register_enable_rings(ctx); 842 break; 843 case IORING_REGISTER_RESTRICTIONS: 844 ret = io_register_restrictions(ctx, arg, nr_args); 845 break; 846 case IORING_REGISTER_FILES2: 847 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE); 848 break; 849 case IORING_REGISTER_FILES_UPDATE2: 850 ret = io_register_rsrc_update(ctx, arg, nr_args, 851 IORING_RSRC_FILE); 852 break; 853 case IORING_REGISTER_BUFFERS2: 854 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER); 855 break; 856 case IORING_REGISTER_BUFFERS_UPDATE: 857 ret = io_register_rsrc_update(ctx, arg, nr_args, 858 IORING_RSRC_BUFFER); 859 break; 860 case IORING_REGISTER_IOWQ_AFF: 861 ret = -EINVAL; 862 if (!arg || !nr_args) 863 break; 864 ret = io_register_iowq_aff(ctx, arg, nr_args); 865 break; 866 case IORING_UNREGISTER_IOWQ_AFF: 867 ret = -EINVAL; 868 if (arg || nr_args) 869 break; 870 ret = io_unregister_iowq_aff(ctx); 871 break; 872 case IORING_REGISTER_IOWQ_MAX_WORKERS: 873 ret = -EINVAL; 874 if (!arg || nr_args != 2) 875 break; 876 ret = io_register_iowq_max_workers(ctx, arg); 877 break; 878 case IORING_REGISTER_RING_FDS: 879 ret = io_ringfd_register(ctx, arg, nr_args); 880 break; 881 case IORING_UNREGISTER_RING_FDS: 882 ret = io_ringfd_unregister(ctx, arg, nr_args); 883 break; 884 case IORING_REGISTER_PBUF_RING: 885 ret = -EINVAL; 886 if (!arg || nr_args != 1) 887 break; 888 ret = io_register_pbuf_ring(ctx, arg); 889 break; 890 case IORING_UNREGISTER_PBUF_RING: 891 ret = -EINVAL; 892 if (!arg || nr_args != 1) 893 break; 894 ret = io_unregister_pbuf_ring(ctx, arg); 895 break; 896 case IORING_REGISTER_SYNC_CANCEL: 897 ret = -EINVAL; 898 if (!arg || nr_args != 1) 899 break; 900 ret = io_sync_cancel(ctx, arg); 901 break; 902 case IORING_REGISTER_FILE_ALLOC_RANGE: 903 ret = -EINVAL; 904 if (!arg || nr_args) 905 break; 906 ret = io_register_file_alloc_range(ctx, arg); 907 break; 908 case IORING_REGISTER_PBUF_STATUS: 909 ret = -EINVAL; 910 if (!arg || nr_args != 1) 911 break; 912 ret = io_register_pbuf_status(ctx, arg); 913 break; 914 case IORING_REGISTER_NAPI: 915 ret = -EINVAL; 916 if (!arg || nr_args != 1) 917 break; 918 ret = io_register_napi(ctx, arg); 919 break; 920 case IORING_UNREGISTER_NAPI: 921 ret = -EINVAL; 922 if (nr_args != 1) 923 break; 924 ret = io_unregister_napi(ctx, arg); 925 break; 926 case IORING_REGISTER_CLOCK: 927 ret = -EINVAL; 928 if (!arg || nr_args) 929 break; 930 ret = io_register_clock(ctx, arg); 931 break; 932 case IORING_REGISTER_CLONE_BUFFERS: 933 ret = -EINVAL; 934 if (!arg || nr_args != 1) 935 break; 936 ret = io_register_clone_buffers(ctx, arg); 937 break; 938 case IORING_REGISTER_ZCRX_IFQ: 939 ret = -EINVAL; 940 if (!arg || nr_args != 1) 941 break; 942 ret = io_register_zcrx(ctx, arg); 943 break; 944 case IORING_REGISTER_RESIZE_RINGS: 945 ret = -EINVAL; 946 if (!arg || nr_args != 1) 947 break; 948 ret = io_register_resize_rings(ctx, arg); 949 break; 950 case IORING_REGISTER_MEM_REGION: 951 ret = -EINVAL; 952 if (!arg || nr_args != 1) 953 break; 954 ret = io_register_mem_region(ctx, arg); 955 break; 956 case IORING_REGISTER_QUERY: 957 ret = io_query(arg, nr_args); 958 break; 959 case IORING_REGISTER_ZCRX_CTRL: 960 ret = io_zcrx_ctrl(ctx, arg, nr_args); 961 break; 962 case IORING_REGISTER_BPF_FILTER: 963 ret = -EINVAL; 964 965 if (nr_args != 1) 966 break; 967 ret = io_register_bpf_filter(&ctx->restrictions, arg); 968 if (!ret) 969 WRITE_ONCE(ctx->bpf_filters, 970 ctx->restrictions.bpf_filters->filters); 971 break; 972 default: 973 ret = -EINVAL; 974 break; 975 } 976 977 return ret; 978 } 979 980 static int io_uring_register_send_msg_ring(void __user *arg, unsigned int nr_args) 981 { 982 struct io_uring_sqe sqe; 983 984 if (!arg || nr_args != 1) 985 return -EINVAL; 986 if (copy_from_user(&sqe, arg, sizeof(sqe))) 987 return -EFAULT; 988 /* no flags supported */ 989 if (sqe.flags) 990 return -EINVAL; 991 if (sqe.opcode != IORING_OP_MSG_RING) 992 return -EINVAL; 993 994 return io_uring_sync_msg_ring(&sqe); 995 } 996 997 /* 998 * "blind" registration opcodes are ones where there's no ring given, and 999 * hence the source fd must be -1. 1000 */ 1001 static int io_uring_register_blind(unsigned int opcode, void __user *arg, 1002 unsigned int nr_args) 1003 { 1004 switch (opcode) { 1005 case IORING_REGISTER_SEND_MSG_RING: 1006 return io_uring_register_send_msg_ring(arg, nr_args); 1007 case IORING_REGISTER_QUERY: 1008 return io_query(arg, nr_args); 1009 case IORING_REGISTER_RESTRICTIONS: 1010 return io_register_restrictions_task(arg, nr_args); 1011 case IORING_REGISTER_BPF_FILTER: 1012 return io_register_bpf_filter_task(arg, nr_args); 1013 } 1014 return -EINVAL; 1015 } 1016 1017 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, 1018 void __user *, arg, unsigned int, nr_args) 1019 { 1020 struct io_ring_ctx *ctx; 1021 long ret = -EBADF; 1022 struct file *file; 1023 bool use_registered_ring; 1024 1025 use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING); 1026 opcode &= ~IORING_REGISTER_USE_REGISTERED_RING; 1027 1028 if (opcode >= IORING_REGISTER_LAST) 1029 return -EINVAL; 1030 1031 if (fd == -1) 1032 return io_uring_register_blind(opcode, arg, nr_args); 1033 1034 file = io_uring_ctx_get_file(fd, use_registered_ring); 1035 if (IS_ERR(file)) 1036 return PTR_ERR(file); 1037 ctx = file->private_data; 1038 1039 mutex_lock(&ctx->uring_lock); 1040 ret = __io_uring_register(ctx, opcode, arg, nr_args); 1041 1042 trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr, 1043 ctx->buf_table.nr, ret); 1044 mutex_unlock(&ctx->uring_lock); 1045 1046 if (!use_registered_ring) 1047 fput(file); 1048 return ret; 1049 } 1050