// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
			       unsigned int eventfd_async)
{
	struct io_ev_fd *ev_fd;
	__s32 __user *fds = arg;
	int fd;

	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					lockdep_is_held(&ctx->uring_lock));
	if (ev_fd)
		return -EBUSY;

	if (copy_from_user(&fd, fds, sizeof(*fds)))
		return -EFAULT;

	ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
	if (!ev_fd)
		return -ENOMEM;

	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
	if (IS_ERR(ev_fd->cq_ev_fd)) {
		int ret = PTR_ERR(ev_fd->cq_ev_fd);
		kfree(ev_fd);
		return ret;
	}

	spin_lock(&ctx->completion_lock);
	ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
	spin_unlock(&ctx->completion_lock);

	ev_fd->eventfd_async = eventfd_async;
	ctx->has_evfd = true;
	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
	atomic_set(&ev_fd->refs, 1);
	atomic_set(&ev_fd->ops, 0);
	return 0;
}

int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
	struct io_ev_fd *ev_fd;

	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					lockdep_is_held(&ctx->uring_lock));
	if (ev_fd) {
		ctx->has_evfd = false;
		rcu_assign_pointer(ctx->io_ev_fd, NULL);
		if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops))
			call_rcu(&ev_fd->rcu, io_eventfd_ops);
		return 0;
	}

	return -ENXIO;
}

static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	size = struct_size(p, ops, nr_args);
	if (size == SIZE_MAX)
		return -EOVERFLOW;
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;
	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (!io_issue_defs[i].not_supported)
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}

static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}
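
/*
 * A minimal userspace sketch (not part of this file) of driving the probe
 * and personality opcodes handled above. io_uring_register() here stands in
 * for a thin syscall(__NR_io_uring_register, ...) wrapper; the wrapper name
 * and error handling are assumptions for illustration only.
 *
 *	struct io_uring_probe *p;
 *	size_t len = sizeof(*p) + 8 * sizeof(p->ops[0]);
 *	int id;
 *
 *	p = calloc(1, len);		// must be zeroed, see memchr_inv() above
 *	io_uring_register(ring_fd, IORING_REGISTER_PROBE, p, 8);
 *	// p->ops[i].flags & IO_URING_OP_SUPPORTED => opcode i is available
 *
 *	id = io_uring_register(ring_fd, IORING_REGISTER_PERSONALITY, NULL, 0);
 *	// 'id' can later be placed in sqe->personality; drop it again with
 *	// IORING_UNREGISTER_PERSONALITY, passing the id as nr_args.
 */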

static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = 0;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].register_op,
				  ctx->restrictions.register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			ret = -EINVAL;
			goto out;
		}
	}

out:
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;

	kfree(res);
	return ret;
}

static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}
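
/*
 * Hedged userspace sketch (not part of this file): a ring created with
 * IORING_SETUP_R_DISABLED can be locked down before it is enabled. Only one
 * restrictions registration is accepted, and it must happen before
 * IORING_REGISTER_ENABLE_RINGS. io_uring_register() again stands in for a
 * plain syscall(__NR_io_uring_register, ...) call.
 *
 *	struct io_uring_restriction res[2] = {};
 *
 *	res[0].opcode = IORING_RESTRICTION_SQE_OP;
 *	res[0].sqe_op = IORING_OP_READV;		// allow only readv SQEs
 *	res[1].opcode = IORING_RESTRICTION_REGISTER_OP;
 *	res[1].register_op = IORING_REGISTER_BUFFERS;	// allow only buffer registration
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_RESTRICTIONS, res, 2);
 *	io_uring_register(ring_fd, IORING_REGISTER_ENABLE_RINGS, NULL, 0);
 */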

static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}
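
/*
 * Hedged userspace sketch (not part of this file): IORING_REGISTER_IOWQ_AFF
 * takes a CPU bitmask in 'arg' with its byte length in 'nr_args'; anything
 * longer than the kernel cpumask is truncated above. The io_uring_register()
 * call below is shorthand for the raw syscall.
 *
 *	cpu_set_t mask;
 *
 *	CPU_ZERO(&mask);
 *	CPU_SET(0, &mask);		// pin io-wq workers to CPU 0
 *	io_uring_register(ring_fd, IORING_REGISTER_IOWQ_AFF, &mask, sizeof(mask));
 *	...
 *	io_uring_register(ring_fd, IORING_UNREGISTER_IOWQ_AFF, NULL, 0);
 */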

static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}
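
/*
 * Hedged userspace sketch (not part of this file): for
 * IORING_REGISTER_IOWQ_MAX_WORKERS, arg points at two __u32 values,
 * [0] bounded and [1] unbounded workers, and nr_args must be 2. A zero entry
 * leaves that limit untouched, and the limits previously in effect are
 * reported back on return. io_uring_register() is shorthand for the raw
 * syscall.
 *
 *	unsigned int counts[2] = { 4, 0 };	// cap bounded workers at 4
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_IOWQ_MAX_WORKERS, counts, 2);
 *	// counts[] now holds the values that were in effect before the call
 */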

static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
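
/*
 * Hedged userspace sketch (not part of this file): once a ring fd has been
 * registered with IORING_REGISTER_RING_FDS, later register calls can pass
 * the returned index instead of the real fd by OR-ing
 * IORING_REGISTER_USE_REGISTERED_RING into the opcode, which the syscall
 * below strips off before dispatch. io_uring_register() is shorthand for the
 * raw syscall.
 *
 *	struct io_uring_rsrc_update upd = { .data = ring_fd, .offset = -1U };
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_RING_FDS, &upd, 1);
 *	// upd.offset now holds the registered index
 *	io_uring_register(upd.offset,
 *			  IORING_UNREGISTER_EVENTFD | IORING_REGISTER_USE_REGISTERED_RING,
 *			  NULL, 0);
 */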

SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (use_registered_ring) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return -EINVAL;
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (unlikely(!file))
			return -EBADF;
	} else {
		file = fget(fd);
		if (unlikely(!file))
			return -EBADF;
		ret = -EOPNOTSUPP;
		if (!io_is_uring_fops(file))
			goto out_fput;
	}

	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);
	mutex_unlock(&ctx->uring_lock);
	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
	if (!use_registered_ring)
		fput(file);
	return ret;
}