// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
			       unsigned int eventfd_async)
{
	struct io_ev_fd *ev_fd;
	__s32 __user *fds = arg;
	int fd;

	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					lockdep_is_held(&ctx->uring_lock));
	if (ev_fd)
		return -EBUSY;

	if (copy_from_user(&fd, fds, sizeof(*fds)))
		return -EFAULT;

	ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
	if (!ev_fd)
		return -ENOMEM;

	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
	if (IS_ERR(ev_fd->cq_ev_fd)) {
		int ret = PTR_ERR(ev_fd->cq_ev_fd);
		kfree(ev_fd);
		return ret;
	}

	spin_lock(&ctx->completion_lock);
	ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
	spin_unlock(&ctx->completion_lock);

	ev_fd->eventfd_async = eventfd_async;
	ctx->has_evfd = true;
	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
	atomic_set(&ev_fd->refs, 1);
	atomic_set(&ev_fd->ops, 0);
	return 0;
}

int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
	struct io_ev_fd *ev_fd;

	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					lockdep_is_held(&ctx->uring_lock));
	if (ev_fd) {
		ctx->has_evfd = false;
		rcu_assign_pointer(ctx->io_ev_fd, NULL);
		if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops))
			call_rcu(&ev_fd->rcu, io_eventfd_ops);
		return 0;
	}

	return -ENXIO;
}

static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	size = struct_size(p, ops, nr_args);
	if (size == SIZE_MAX)
		return -EOVERFLOW;
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;
	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (!io_issue_defs[i].not_supported)
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}

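/*
 * Register the current task's credentials as a personality id that SQEs
 * can later reference via sqe->personality. Returns the allocated id on
 * success, or a negative error.
 */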
static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = 0;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].register_op,
				  ctx->restrictions.register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			ret = -EINVAL;
			goto out;
		}
	}

out:
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;

	kfree(res);
	return ret;
}

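/*
 * Enable a ring that was created with IORING_SETUP_R_DISABLED: adopt the
 * caller as the submitter task for IORING_SETUP_SINGLE_ISSUER rings, latch
 * any registered restrictions, and wake a waiting SQPOLL thread.
 */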
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

	if (in_compat_syscall()) {
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	} else {
		ret = copy_from_user(new_mask, arg, len);
	}

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

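/*
 * Update the io-wq worker count limits (bounded and unbounded). For SQPOLL
 * rings the limits apply to the SQPOLL task's io-wq, otherwise to the
 * caller's io-wq and are then propagated to every registered task. The
 * previous limits (or zeroes if no io-wq exists yet) are copied back to
 * userspace.
 */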
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		struct io_uring_task *tctx = node->task->io_uring;

		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
	}
	return ret;
}

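/*
 * Dispatch a single io_uring_register() opcode. Called with ctx->uring_lock
 * held; enforces the single-issuer rule and any registered restrictions
 * before acting on the opcode.
 */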
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

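/*
 * io_uring_register() syscall entry: resolve the ring either from a normal
 * file descriptor or, if IORING_REGISTER_USE_REGISTERED_RING is set, from
 * the task's private array of registered ring fds, then run the requested
 * registration under ctx->uring_lock.
 */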
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (use_registered_ring) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return -EINVAL;
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (unlikely(!file))
			return -EBADF;
	} else {
		file = fget(fd);
		if (unlikely(!file))
			return -EBADF;
		ret = -EOPNOTSUPP;
		if (!io_is_uring_fops(file))
			goto out_fput;
	}

	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);
	mutex_unlock(&ctx->uring_lock);
	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
	if (!use_registered_ring)
		fput(file);
	return ret;
}