// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	size = struct_size(p, ops, nr_args);
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (io_uring_op_supported(i))
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}


static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = 0;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].register_op,
				  ctx->restrictions.register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			ret = -EINVAL;
			goto out;
		}
	}

out:
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;

	kfree(res);
	return ret;
}

static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}

static int io_register_clock(struct io_ring_ctx *ctx,
			     struct io_uring_clock_register __user *arg)
{
	struct io_uring_clock_register reg;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;

	switch (reg.clockid) {
	case CLOCK_MONOTONIC:
		ctx->clock_offset = 0;
		break;
	case CLOCK_BOOTTIME:
		ctx->clock_offset = TK_OFFS_BOOT;
		break;
	default:
		return -EINVAL;
	}

	ctx->clockid = reg.clockid;
	return 0;
}

static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

/*
 * Given an 'fd' value, return the ctx associated with it. If 'registered' is
 * true, then the registered index is used. Otherwise, the normal fd table.
 * Caller must call fput() on the returned file, unless it's an ERR_PTR.
 */
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
	struct file *file;

	if (registered) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return ERR_PTR(-EINVAL);
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
	} else {
		file = fget(fd);
	}

	if (unlikely(!file))
		return ERR_PTR(-EBADF);
	if (io_is_uring_fops(file))
		return file;
	fput(file);
	return ERR_PTR(-EOPNOTSUPP);
}

SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	file = io_uring_register_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);
	mutex_unlock(&ctx->uring_lock);
	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
	if (!use_registered_ring)
		fput(file);
	return ret;
}
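
/*
 * Illustrative userspace sketch (kept as a comment, not part of the kernel
 * build): one plausible way to exercise the register path above is
 * IORING_REGISTER_PROBE, issued through the raw syscall on a freshly set up
 * ring. It assumes the uapi definitions from <linux/io_uring.h> and syscall
 * numbers from <sys/syscall.h>; error handling is trimmed, and a real
 * application would normally use liburing rather than open-coding this.
 *
 *	struct io_uring_params params = { };
 *	int ring_fd = syscall(__NR_io_uring_setup, 4, &params);
 *
 *	// io_probe() rejects non-zeroed input, so calloc() the buffer
 *	size_t len = sizeof(struct io_uring_probe) +
 *		     IORING_OP_LAST * sizeof(struct io_uring_probe_op);
 *	struct io_uring_probe *probe = calloc(1, len);
 *
 *	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PROBE,
 *		    probe, IORING_OP_LAST) == 0) {
 *		for (int i = 0; i < probe->ops_len; i++)
 *			if (probe->ops[i].flags & IO_URING_OP_SUPPORTED)
 *				printf("opcode %u supported\n", probe->ops[i].op);
 *	}
 */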