1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/kernel.h> 3 #include <linux/errno.h> 4 #include <linux/fs.h> 5 #include <linux/file.h> 6 #include <linux/mm.h> 7 #include <linux/slab.h> 8 #include <linux/nospec.h> 9 #include <linux/hugetlb.h> 10 #include <linux/compat.h> 11 #include <linux/io_uring.h> 12 13 #include <uapi/linux/io_uring.h> 14 15 #include "io_uring.h" 16 #include "alloc_cache.h" 17 #include "openclose.h" 18 #include "rsrc.h" 19 #include "memmap.h" 20 #include "register.h" 21 22 struct io_rsrc_update { 23 struct file *file; 24 u64 arg; 25 u32 nr_args; 26 u32 offset; 27 }; 28 29 static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); 30 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, 31 struct io_mapped_ubuf **pimu, 32 struct page **last_hpage); 33 34 /* only define max */ 35 #define IORING_MAX_FIXED_FILES (1U << 20) 36 #define IORING_MAX_REG_BUFFERS (1U << 14) 37 38 static const struct io_mapped_ubuf dummy_ubuf = { 39 /* set invalid range, so io_import_fixed() fails meeting it */ 40 .ubuf = -1UL, 41 .ubuf_end = 0, 42 }; 43 44 int __io_account_mem(struct user_struct *user, unsigned long nr_pages) 45 { 46 unsigned long page_limit, cur_pages, new_pages; 47 48 if (!nr_pages) 49 return 0; 50 51 /* Don't allow more pages than we can safely lock */ 52 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; 53 54 cur_pages = atomic_long_read(&user->locked_vm); 55 do { 56 new_pages = cur_pages + nr_pages; 57 if (new_pages > page_limit) 58 return -ENOMEM; 59 } while (!atomic_long_try_cmpxchg(&user->locked_vm, 60 &cur_pages, new_pages)); 61 return 0; 62 } 63 64 static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) 65 { 66 if (ctx->user) 67 __io_unaccount_mem(ctx->user, nr_pages); 68 69 if (ctx->mm_account) 70 atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm); 71 } 72 73 static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) 74 { 75 int ret; 76 77 if (ctx->user) { 78 ret = __io_account_mem(ctx->user, nr_pages); 79 if (ret) 80 return ret; 81 } 82 83 if (ctx->mm_account) 84 atomic64_add(nr_pages, &ctx->mm_account->pinned_vm); 85 86 return 0; 87 } 88 89 static int io_buffer_validate(struct iovec *iov) 90 { 91 unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); 92 93 /* 94 * Don't impose further limits on the size and buffer 95 * constraints here, we'll -EINVAL later when IO is 96 * submitted if they are wrong. 97 */ 98 if (!iov->iov_base) 99 return iov->iov_len ? -EFAULT : 0; 100 if (!iov->iov_len) 101 return -EFAULT; 102 103 /* arbitrary limit, but we need something */ 104 if (iov->iov_len > SZ_1G) 105 return -EFAULT; 106 107 if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp)) 108 return -EOVERFLOW; 109 110 return 0; 111 } 112 113 static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot) 114 { 115 struct io_mapped_ubuf *imu = *slot; 116 unsigned int i; 117 118 *slot = NULL; 119 if (imu != &dummy_ubuf) { 120 if (!refcount_dec_and_test(&imu->refs)) 121 return; 122 for (i = 0; i < imu->nr_bvecs; i++) 123 unpin_user_page(imu->bvec[i].bv_page); 124 if (imu->acct_pages) 125 io_unaccount_mem(ctx, imu->acct_pages); 126 kvfree(imu); 127 } 128 } 129 130 static void io_rsrc_put_work(struct io_rsrc_node *node) 131 { 132 struct io_rsrc_put *prsrc = &node->item; 133 134 if (prsrc->tag) 135 io_post_aux_cqe(node->ctx, prsrc->tag, 0, 0); 136 137 switch (node->type) { 138 case IORING_RSRC_FILE: 139 fput(prsrc->file); 140 break; 141 case IORING_RSRC_BUFFER: 142 io_rsrc_buf_put(node->ctx, prsrc); 143 break; 144 default: 145 WARN_ON_ONCE(1); 146 break; 147 } 148 } 149 150 void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node) 151 { 152 if (!io_alloc_cache_put(&ctx->rsrc_node_cache, node)) 153 kfree(node); 154 } 155 156 void io_rsrc_node_ref_zero(struct io_rsrc_node *node) 157 __must_hold(&node->ctx->uring_lock) 158 { 159 struct io_ring_ctx *ctx = node->ctx; 160 161 while (!list_empty(&ctx->rsrc_ref_list)) { 162 node = list_first_entry(&ctx->rsrc_ref_list, 163 struct io_rsrc_node, node); 164 /* recycle ref nodes in order */ 165 if (node->refs) 166 break; 167 list_del(&node->node); 168 169 if (likely(!node->empty)) 170 io_rsrc_put_work(node); 171 io_rsrc_node_destroy(ctx, node); 172 } 173 if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce)) 174 wake_up_all(&ctx->rsrc_quiesce_wq); 175 } 176 177 struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) 178 { 179 struct io_rsrc_node *ref_node; 180 181 ref_node = io_alloc_cache_get(&ctx->rsrc_node_cache); 182 if (!ref_node) { 183 ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); 184 if (!ref_node) 185 return NULL; 186 } 187 188 ref_node->ctx = ctx; 189 ref_node->empty = 0; 190 ref_node->refs = 1; 191 return ref_node; 192 } 193 194 __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, 195 struct io_ring_ctx *ctx) 196 { 197 struct io_rsrc_node *backup; 198 DEFINE_WAIT(we); 199 int ret; 200 201 /* As We may drop ->uring_lock, other task may have started quiesce */ 202 if (data->quiesce) 203 return -ENXIO; 204 205 backup = io_rsrc_node_alloc(ctx); 206 if (!backup) 207 return -ENOMEM; 208 ctx->rsrc_node->empty = true; 209 ctx->rsrc_node->type = -1; 210 list_add_tail(&ctx->rsrc_node->node, &ctx->rsrc_ref_list); 211 io_put_rsrc_node(ctx, ctx->rsrc_node); 212 ctx->rsrc_node = backup; 213 214 if (list_empty(&ctx->rsrc_ref_list)) 215 return 0; 216 217 if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { 218 atomic_set(&ctx->cq_wait_nr, 1); 219 smp_mb(); 220 } 221 222 ctx->rsrc_quiesce++; 223 data->quiesce = true; 224 do { 225 prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE); 226 mutex_unlock(&ctx->uring_lock); 227 228 ret = io_run_task_work_sig(ctx); 229 if (ret < 0) { 230 finish_wait(&ctx->rsrc_quiesce_wq, &we); 231 mutex_lock(&ctx->uring_lock); 232 if (list_empty(&ctx->rsrc_ref_list)) 233 ret = 0; 234 break; 235 } 236 237 schedule(); 238 mutex_lock(&ctx->uring_lock); 239 ret = 0; 240 } while (!list_empty(&ctx->rsrc_ref_list)); 241 242 finish_wait(&ctx->rsrc_quiesce_wq, &we); 243 data->quiesce = false; 244 ctx->rsrc_quiesce--; 245 246 if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { 247 atomic_set(&ctx->cq_wait_nr, 0); 248 smp_mb(); 249 } 250 return ret; 251 } 252 253 static void io_free_page_table(void **table, size_t size) 254 { 255 unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); 256 257 for (i = 0; i < nr_tables; i++) 258 kfree(table[i]); 259 kfree(table); 260 } 261 262 static void io_rsrc_data_free(struct io_rsrc_data *data) 263 { 264 size_t size = data->nr * sizeof(data->tags[0][0]); 265 266 if (data->tags) 267 io_free_page_table((void **)data->tags, size); 268 kfree(data); 269 } 270 271 static __cold void **io_alloc_page_table(size_t size) 272 { 273 unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); 274 size_t init_size = size; 275 void **table; 276 277 table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT); 278 if (!table) 279 return NULL; 280 281 for (i = 0; i < nr_tables; i++) { 282 unsigned int this_size = min_t(size_t, size, PAGE_SIZE); 283 284 table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT); 285 if (!table[i]) { 286 io_free_page_table(table, init_size); 287 return NULL; 288 } 289 size -= this_size; 290 } 291 return table; 292 } 293 294 __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, int type, 295 u64 __user *utags, 296 unsigned nr, struct io_rsrc_data **pdata) 297 { 298 struct io_rsrc_data *data; 299 int ret = 0; 300 unsigned i; 301 302 data = kzalloc(sizeof(*data), GFP_KERNEL); 303 if (!data) 304 return -ENOMEM; 305 data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0])); 306 if (!data->tags) { 307 kfree(data); 308 return -ENOMEM; 309 } 310 311 data->nr = nr; 312 data->ctx = ctx; 313 data->rsrc_type = type; 314 if (utags) { 315 ret = -EFAULT; 316 for (i = 0; i < nr; i++) { 317 u64 *tag_slot = io_get_tag_slot(data, i); 318 319 if (copy_from_user(tag_slot, &utags[i], 320 sizeof(*tag_slot))) 321 goto fail; 322 } 323 } 324 *pdata = data; 325 return 0; 326 fail: 327 io_rsrc_data_free(data); 328 return ret; 329 } 330 331 static int __io_sqe_files_update(struct io_ring_ctx *ctx, 332 struct io_uring_rsrc_update2 *up, 333 unsigned nr_args) 334 { 335 u64 __user *tags = u64_to_user_ptr(up->tags); 336 __s32 __user *fds = u64_to_user_ptr(up->data); 337 struct io_rsrc_data *data = ctx->file_data; 338 struct io_fixed_file *file_slot; 339 int fd, i, err = 0; 340 unsigned int done; 341 342 if (!ctx->file_data) 343 return -ENXIO; 344 if (up->offset + nr_args > ctx->nr_user_files) 345 return -EINVAL; 346 347 for (done = 0; done < nr_args; done++) { 348 u64 tag = 0; 349 350 if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) || 351 copy_from_user(&fd, &fds[done], sizeof(fd))) { 352 err = -EFAULT; 353 break; 354 } 355 if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) { 356 err = -EINVAL; 357 break; 358 } 359 if (fd == IORING_REGISTER_FILES_SKIP) 360 continue; 361 362 i = array_index_nospec(up->offset + done, ctx->nr_user_files); 363 file_slot = io_fixed_file_slot(&ctx->file_table, i); 364 365 if (file_slot->file_ptr) { 366 err = io_queue_rsrc_removal(data, i, 367 io_slot_file(file_slot)); 368 if (err) 369 break; 370 file_slot->file_ptr = 0; 371 io_file_bitmap_clear(&ctx->file_table, i); 372 } 373 if (fd != -1) { 374 struct file *file = fget(fd); 375 376 if (!file) { 377 err = -EBADF; 378 break; 379 } 380 /* 381 * Don't allow io_uring instances to be registered. 382 */ 383 if (io_is_uring_fops(file)) { 384 fput(file); 385 err = -EBADF; 386 break; 387 } 388 *io_get_tag_slot(data, i) = tag; 389 io_fixed_file_set(file_slot, file); 390 io_file_bitmap_set(&ctx->file_table, i); 391 } 392 } 393 return done ? done : err; 394 } 395 396 static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, 397 struct io_uring_rsrc_update2 *up, 398 unsigned int nr_args) 399 { 400 u64 __user *tags = u64_to_user_ptr(up->tags); 401 struct iovec fast_iov, *iov; 402 struct page *last_hpage = NULL; 403 struct iovec __user *uvec; 404 u64 user_data = up->data; 405 __u32 done; 406 int i, err; 407 408 if (!ctx->buf_data) 409 return -ENXIO; 410 if (up->offset + nr_args > ctx->nr_user_bufs) 411 return -EINVAL; 412 413 for (done = 0; done < nr_args; done++) { 414 struct io_mapped_ubuf *imu; 415 u64 tag = 0; 416 417 uvec = u64_to_user_ptr(user_data); 418 iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); 419 if (IS_ERR(iov)) { 420 err = PTR_ERR(iov); 421 break; 422 } 423 if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) { 424 err = -EFAULT; 425 break; 426 } 427 err = io_buffer_validate(iov); 428 if (err) 429 break; 430 if (!iov->iov_base && tag) { 431 err = -EINVAL; 432 break; 433 } 434 err = io_sqe_buffer_register(ctx, iov, &imu, &last_hpage); 435 if (err) 436 break; 437 438 i = array_index_nospec(up->offset + done, ctx->nr_user_bufs); 439 if (ctx->user_bufs[i] != &dummy_ubuf) { 440 err = io_queue_rsrc_removal(ctx->buf_data, i, 441 ctx->user_bufs[i]); 442 if (unlikely(err)) { 443 io_buffer_unmap(ctx, &imu); 444 break; 445 } 446 ctx->user_bufs[i] = (struct io_mapped_ubuf *)&dummy_ubuf; 447 } 448 449 ctx->user_bufs[i] = imu; 450 *io_get_tag_slot(ctx->buf_data, i) = tag; 451 if (ctx->compat) 452 user_data += sizeof(struct compat_iovec); 453 else 454 user_data += sizeof(struct iovec); 455 } 456 return done ? done : err; 457 } 458 459 static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, 460 struct io_uring_rsrc_update2 *up, 461 unsigned nr_args) 462 { 463 __u32 tmp; 464 465 lockdep_assert_held(&ctx->uring_lock); 466 467 if (check_add_overflow(up->offset, nr_args, &tmp)) 468 return -EOVERFLOW; 469 470 switch (type) { 471 case IORING_RSRC_FILE: 472 return __io_sqe_files_update(ctx, up, nr_args); 473 case IORING_RSRC_BUFFER: 474 return __io_sqe_buffers_update(ctx, up, nr_args); 475 } 476 return -EINVAL; 477 } 478 479 int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, 480 unsigned nr_args) 481 { 482 struct io_uring_rsrc_update2 up; 483 484 if (!nr_args) 485 return -EINVAL; 486 memset(&up, 0, sizeof(up)); 487 if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update))) 488 return -EFAULT; 489 if (up.resv || up.resv2) 490 return -EINVAL; 491 return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args); 492 } 493 494 int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, 495 unsigned size, unsigned type) 496 { 497 struct io_uring_rsrc_update2 up; 498 499 if (size != sizeof(up)) 500 return -EINVAL; 501 if (copy_from_user(&up, arg, sizeof(up))) 502 return -EFAULT; 503 if (!up.nr || up.resv || up.resv2) 504 return -EINVAL; 505 return __io_register_rsrc_update(ctx, type, &up, up.nr); 506 } 507 508 __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, 509 unsigned int size, unsigned int type) 510 { 511 struct io_uring_rsrc_register rr; 512 513 /* keep it extendible */ 514 if (size != sizeof(rr)) 515 return -EINVAL; 516 517 memset(&rr, 0, sizeof(rr)); 518 if (copy_from_user(&rr, arg, size)) 519 return -EFAULT; 520 if (!rr.nr || rr.resv2) 521 return -EINVAL; 522 if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE) 523 return -EINVAL; 524 525 switch (type) { 526 case IORING_RSRC_FILE: 527 if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) 528 break; 529 return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data), 530 rr.nr, u64_to_user_ptr(rr.tags)); 531 case IORING_RSRC_BUFFER: 532 if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) 533 break; 534 return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data), 535 rr.nr, u64_to_user_ptr(rr.tags)); 536 } 537 return -EINVAL; 538 } 539 540 int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 541 { 542 struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update); 543 544 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) 545 return -EINVAL; 546 if (sqe->rw_flags || sqe->splice_fd_in) 547 return -EINVAL; 548 549 up->offset = READ_ONCE(sqe->off); 550 up->nr_args = READ_ONCE(sqe->len); 551 if (!up->nr_args) 552 return -EINVAL; 553 up->arg = READ_ONCE(sqe->addr); 554 return 0; 555 } 556 557 static int io_files_update_with_index_alloc(struct io_kiocb *req, 558 unsigned int issue_flags) 559 { 560 struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update); 561 __s32 __user *fds = u64_to_user_ptr(up->arg); 562 unsigned int done; 563 struct file *file; 564 int ret, fd; 565 566 if (!req->ctx->file_data) 567 return -ENXIO; 568 569 for (done = 0; done < up->nr_args; done++) { 570 if (copy_from_user(&fd, &fds[done], sizeof(fd))) { 571 ret = -EFAULT; 572 break; 573 } 574 575 file = fget(fd); 576 if (!file) { 577 ret = -EBADF; 578 break; 579 } 580 ret = io_fixed_fd_install(req, issue_flags, file, 581 IORING_FILE_INDEX_ALLOC); 582 if (ret < 0) 583 break; 584 if (copy_to_user(&fds[done], &ret, sizeof(ret))) { 585 __io_close_fixed(req->ctx, issue_flags, ret); 586 ret = -EFAULT; 587 break; 588 } 589 } 590 591 if (done) 592 return done; 593 return ret; 594 } 595 596 int io_files_update(struct io_kiocb *req, unsigned int issue_flags) 597 { 598 struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update); 599 struct io_ring_ctx *ctx = req->ctx; 600 struct io_uring_rsrc_update2 up2; 601 int ret; 602 603 up2.offset = up->offset; 604 up2.data = up->arg; 605 up2.nr = 0; 606 up2.tags = 0; 607 up2.resv = 0; 608 up2.resv2 = 0; 609 610 if (up->offset == IORING_FILE_INDEX_ALLOC) { 611 ret = io_files_update_with_index_alloc(req, issue_flags); 612 } else { 613 io_ring_submit_lock(ctx, issue_flags); 614 ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, 615 &up2, up->nr_args); 616 io_ring_submit_unlock(ctx, issue_flags); 617 } 618 619 if (ret < 0) 620 req_set_fail(req); 621 io_req_set_res(req, ret, 0); 622 return IOU_OK; 623 } 624 625 int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc) 626 { 627 struct io_ring_ctx *ctx = data->ctx; 628 struct io_rsrc_node *node = ctx->rsrc_node; 629 u64 *tag_slot = io_get_tag_slot(data, idx); 630 631 ctx->rsrc_node = io_rsrc_node_alloc(ctx); 632 if (unlikely(!ctx->rsrc_node)) { 633 ctx->rsrc_node = node; 634 return -ENOMEM; 635 } 636 637 node->item.rsrc = rsrc; 638 node->type = data->rsrc_type; 639 node->item.tag = *tag_slot; 640 *tag_slot = 0; 641 list_add_tail(&node->node, &ctx->rsrc_ref_list); 642 io_put_rsrc_node(ctx, node); 643 return 0; 644 } 645 646 void __io_sqe_files_unregister(struct io_ring_ctx *ctx) 647 { 648 int i; 649 650 for (i = 0; i < ctx->nr_user_files; i++) { 651 struct file *file = io_file_from_index(&ctx->file_table, i); 652 653 if (!file) 654 continue; 655 io_file_bitmap_clear(&ctx->file_table, i); 656 fput(file); 657 } 658 659 io_free_file_tables(&ctx->file_table); 660 io_file_table_set_alloc_range(ctx, 0, 0); 661 io_rsrc_data_free(ctx->file_data); 662 ctx->file_data = NULL; 663 ctx->nr_user_files = 0; 664 } 665 666 int io_sqe_files_unregister(struct io_ring_ctx *ctx) 667 { 668 unsigned nr = ctx->nr_user_files; 669 int ret; 670 671 if (!ctx->file_data) 672 return -ENXIO; 673 674 /* 675 * Quiesce may unlock ->uring_lock, and while it's not held 676 * prevent new requests using the table. 677 */ 678 ctx->nr_user_files = 0; 679 ret = io_rsrc_ref_quiesce(ctx->file_data, ctx); 680 ctx->nr_user_files = nr; 681 if (!ret) 682 __io_sqe_files_unregister(ctx); 683 return ret; 684 } 685 686 int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, 687 unsigned nr_args, u64 __user *tags) 688 { 689 __s32 __user *fds = (__s32 __user *) arg; 690 struct file *file; 691 int fd, ret; 692 unsigned i; 693 694 if (ctx->file_data) 695 return -EBUSY; 696 if (!nr_args) 697 return -EINVAL; 698 if (nr_args > IORING_MAX_FIXED_FILES) 699 return -EMFILE; 700 if (nr_args > rlimit(RLIMIT_NOFILE)) 701 return -EMFILE; 702 ret = io_rsrc_data_alloc(ctx, IORING_RSRC_FILE, tags, nr_args, 703 &ctx->file_data); 704 if (ret) 705 return ret; 706 707 if (!io_alloc_file_tables(&ctx->file_table, nr_args)) { 708 io_rsrc_data_free(ctx->file_data); 709 ctx->file_data = NULL; 710 return -ENOMEM; 711 } 712 713 for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { 714 struct io_fixed_file *file_slot; 715 716 if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) { 717 ret = -EFAULT; 718 goto fail; 719 } 720 /* allow sparse sets */ 721 if (!fds || fd == -1) { 722 ret = -EINVAL; 723 if (unlikely(*io_get_tag_slot(ctx->file_data, i))) 724 goto fail; 725 continue; 726 } 727 728 file = fget(fd); 729 ret = -EBADF; 730 if (unlikely(!file)) 731 goto fail; 732 733 /* 734 * Don't allow io_uring instances to be registered. 735 */ 736 if (io_is_uring_fops(file)) { 737 fput(file); 738 goto fail; 739 } 740 file_slot = io_fixed_file_slot(&ctx->file_table, i); 741 io_fixed_file_set(file_slot, file); 742 io_file_bitmap_set(&ctx->file_table, i); 743 } 744 745 /* default it to the whole table */ 746 io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files); 747 return 0; 748 fail: 749 __io_sqe_files_unregister(ctx); 750 return ret; 751 } 752 753 static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) 754 { 755 io_buffer_unmap(ctx, &prsrc->buf); 756 prsrc->buf = NULL; 757 } 758 759 void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx) 760 { 761 unsigned int i; 762 763 for (i = 0; i < ctx->nr_user_bufs; i++) 764 io_buffer_unmap(ctx, &ctx->user_bufs[i]); 765 kfree(ctx->user_bufs); 766 io_rsrc_data_free(ctx->buf_data); 767 ctx->user_bufs = NULL; 768 ctx->buf_data = NULL; 769 ctx->nr_user_bufs = 0; 770 } 771 772 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) 773 { 774 unsigned nr = ctx->nr_user_bufs; 775 int ret; 776 777 if (!ctx->buf_data) 778 return -ENXIO; 779 780 /* 781 * Quiesce may unlock ->uring_lock, and while it's not held 782 * prevent new requests using the table. 783 */ 784 ctx->nr_user_bufs = 0; 785 ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx); 786 ctx->nr_user_bufs = nr; 787 if (!ret) 788 __io_sqe_buffers_unregister(ctx); 789 return ret; 790 } 791 792 /* 793 * Not super efficient, but this is just a registration time. And we do cache 794 * the last compound head, so generally we'll only do a full search if we don't 795 * match that one. 796 * 797 * We check if the given compound head page has already been accounted, to 798 * avoid double accounting it. This allows us to account the full size of the 799 * page, not just the constituent pages of a huge page. 800 */ 801 static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages, 802 int nr_pages, struct page *hpage) 803 { 804 int i, j; 805 806 /* check current page array */ 807 for (i = 0; i < nr_pages; i++) { 808 if (!PageCompound(pages[i])) 809 continue; 810 if (compound_head(pages[i]) == hpage) 811 return true; 812 } 813 814 /* check previously registered pages */ 815 for (i = 0; i < ctx->nr_user_bufs; i++) { 816 struct io_mapped_ubuf *imu = ctx->user_bufs[i]; 817 818 for (j = 0; j < imu->nr_bvecs; j++) { 819 if (!PageCompound(imu->bvec[j].bv_page)) 820 continue; 821 if (compound_head(imu->bvec[j].bv_page) == hpage) 822 return true; 823 } 824 } 825 826 return false; 827 } 828 829 static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, 830 int nr_pages, struct io_mapped_ubuf *imu, 831 struct page **last_hpage) 832 { 833 int i, ret; 834 835 imu->acct_pages = 0; 836 for (i = 0; i < nr_pages; i++) { 837 if (!PageCompound(pages[i])) { 838 imu->acct_pages++; 839 } else { 840 struct page *hpage; 841 842 hpage = compound_head(pages[i]); 843 if (hpage == *last_hpage) 844 continue; 845 *last_hpage = hpage; 846 if (headpage_already_acct(ctx, pages, i, hpage)) 847 continue; 848 imu->acct_pages += page_size(hpage) >> PAGE_SHIFT; 849 } 850 } 851 852 if (!imu->acct_pages) 853 return 0; 854 855 ret = io_account_mem(ctx, imu->acct_pages); 856 if (ret) 857 imu->acct_pages = 0; 858 return ret; 859 } 860 861 static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages, 862 struct io_imu_folio_data *data, int nr_folios) 863 { 864 struct page **page_array = *pages, **new_array = NULL; 865 int nr_pages_left = *nr_pages, i, j; 866 867 /* Store head pages only*/ 868 new_array = kvmalloc_array(nr_folios, sizeof(struct page *), 869 GFP_KERNEL); 870 if (!new_array) 871 return false; 872 873 new_array[0] = compound_head(page_array[0]); 874 /* 875 * The pages are bound to the folio, it doesn't 876 * actually unpin them but drops all but one reference, 877 * which is usually put down by io_buffer_unmap(). 878 * Note, needs a better helper. 879 */ 880 if (data->nr_pages_head > 1) 881 unpin_user_pages(&page_array[1], data->nr_pages_head - 1); 882 883 j = data->nr_pages_head; 884 nr_pages_left -= data->nr_pages_head; 885 for (i = 1; i < nr_folios; i++) { 886 unsigned int nr_unpin; 887 888 new_array[i] = page_array[j]; 889 nr_unpin = min_t(unsigned int, nr_pages_left - 1, 890 data->nr_pages_mid - 1); 891 if (nr_unpin) 892 unpin_user_pages(&page_array[j+1], nr_unpin); 893 j += data->nr_pages_mid; 894 nr_pages_left -= data->nr_pages_mid; 895 } 896 kvfree(page_array); 897 *pages = new_array; 898 *nr_pages = nr_folios; 899 return true; 900 } 901 902 static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages, 903 struct io_imu_folio_data *data) 904 { 905 struct page **page_array = *pages; 906 struct folio *folio = page_folio(page_array[0]); 907 unsigned int count = 1, nr_folios = 1; 908 int i; 909 910 if (*nr_pages <= 1) 911 return false; 912 913 data->nr_pages_mid = folio_nr_pages(folio); 914 if (data->nr_pages_mid == 1) 915 return false; 916 917 data->folio_shift = folio_shift(folio); 918 /* 919 * Check if pages are contiguous inside a folio, and all folios have 920 * the same page count except for the head and tail. 921 */ 922 for (i = 1; i < *nr_pages; i++) { 923 if (page_folio(page_array[i]) == folio && 924 page_array[i] == page_array[i-1] + 1) { 925 count++; 926 continue; 927 } 928 929 if (nr_folios == 1) { 930 if (folio_page_idx(folio, page_array[i-1]) != 931 data->nr_pages_mid - 1) 932 return false; 933 934 data->nr_pages_head = count; 935 } else if (count != data->nr_pages_mid) { 936 return false; 937 } 938 939 folio = page_folio(page_array[i]); 940 if (folio_size(folio) != (1UL << data->folio_shift) || 941 folio_page_idx(folio, page_array[i]) != 0) 942 return false; 943 944 count = 1; 945 nr_folios++; 946 } 947 if (nr_folios == 1) 948 data->nr_pages_head = count; 949 950 return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios); 951 } 952 953 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, 954 struct io_mapped_ubuf **pimu, 955 struct page **last_hpage) 956 { 957 struct io_mapped_ubuf *imu = NULL; 958 struct page **pages = NULL; 959 unsigned long off; 960 size_t size; 961 int ret, nr_pages, i; 962 struct io_imu_folio_data data; 963 bool coalesced; 964 965 *pimu = (struct io_mapped_ubuf *)&dummy_ubuf; 966 if (!iov->iov_base) 967 return 0; 968 969 ret = -ENOMEM; 970 pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, 971 &nr_pages); 972 if (IS_ERR(pages)) { 973 ret = PTR_ERR(pages); 974 pages = NULL; 975 goto done; 976 } 977 978 /* If it's huge page(s), try to coalesce them into fewer bvec entries */ 979 coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data); 980 981 imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); 982 if (!imu) 983 goto done; 984 985 ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage); 986 if (ret) { 987 unpin_user_pages(pages, nr_pages); 988 goto done; 989 } 990 991 size = iov->iov_len; 992 /* store original address for later verification */ 993 imu->ubuf = (unsigned long) iov->iov_base; 994 imu->ubuf_end = imu->ubuf + iov->iov_len; 995 imu->nr_bvecs = nr_pages; 996 imu->folio_shift = PAGE_SHIFT; 997 imu->folio_mask = PAGE_MASK; 998 if (coalesced) { 999 imu->folio_shift = data.folio_shift; 1000 imu->folio_mask = ~((1UL << data.folio_shift) - 1); 1001 } 1002 refcount_set(&imu->refs, 1); 1003 off = (unsigned long) iov->iov_base & ~imu->folio_mask; 1004 *pimu = imu; 1005 ret = 0; 1006 1007 for (i = 0; i < nr_pages; i++) { 1008 size_t vec_len; 1009 1010 vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off); 1011 bvec_set_page(&imu->bvec[i], pages[i], vec_len, off); 1012 off = 0; 1013 size -= vec_len; 1014 } 1015 done: 1016 if (ret) 1017 kvfree(imu); 1018 kvfree(pages); 1019 return ret; 1020 } 1021 1022 static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args) 1023 { 1024 ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL); 1025 return ctx->user_bufs ? 0 : -ENOMEM; 1026 } 1027 1028 int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, 1029 unsigned int nr_args, u64 __user *tags) 1030 { 1031 struct page *last_hpage = NULL; 1032 struct io_rsrc_data *data; 1033 struct iovec fast_iov, *iov = &fast_iov; 1034 const struct iovec __user *uvec; 1035 int i, ret; 1036 1037 BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); 1038 1039 if (ctx->user_bufs) 1040 return -EBUSY; 1041 if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) 1042 return -EINVAL; 1043 ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, tags, nr_args, &data); 1044 if (ret) 1045 return ret; 1046 ret = io_buffers_map_alloc(ctx, nr_args); 1047 if (ret) { 1048 io_rsrc_data_free(data); 1049 return ret; 1050 } 1051 1052 if (!arg) 1053 memset(iov, 0, sizeof(*iov)); 1054 1055 for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { 1056 if (arg) { 1057 uvec = (struct iovec __user *) arg; 1058 iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); 1059 if (IS_ERR(iov)) { 1060 ret = PTR_ERR(iov); 1061 break; 1062 } 1063 ret = io_buffer_validate(iov); 1064 if (ret) 1065 break; 1066 if (ctx->compat) 1067 arg += sizeof(struct compat_iovec); 1068 else 1069 arg += sizeof(struct iovec); 1070 } 1071 1072 if (!iov->iov_base && *io_get_tag_slot(data, i)) { 1073 ret = -EINVAL; 1074 break; 1075 } 1076 1077 ret = io_sqe_buffer_register(ctx, iov, &ctx->user_bufs[i], 1078 &last_hpage); 1079 if (ret) 1080 break; 1081 } 1082 1083 WARN_ON_ONCE(ctx->buf_data); 1084 1085 ctx->buf_data = data; 1086 if (ret) 1087 __io_sqe_buffers_unregister(ctx); 1088 return ret; 1089 } 1090 1091 int io_import_fixed(int ddir, struct iov_iter *iter, 1092 struct io_mapped_ubuf *imu, 1093 u64 buf_addr, size_t len) 1094 { 1095 u64 buf_end; 1096 size_t offset; 1097 1098 if (WARN_ON_ONCE(!imu)) 1099 return -EFAULT; 1100 if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) 1101 return -EFAULT; 1102 /* not inside the mapped region */ 1103 if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end)) 1104 return -EFAULT; 1105 1106 /* 1107 * Might not be a start of buffer, set size appropriately 1108 * and advance us to the beginning. 1109 */ 1110 offset = buf_addr - imu->ubuf; 1111 iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); 1112 1113 if (offset) { 1114 /* 1115 * Don't use iov_iter_advance() here, as it's really slow for 1116 * using the latter parts of a big fixed buffer - it iterates 1117 * over each segment manually. We can cheat a bit here, because 1118 * we know that: 1119 * 1120 * 1) it's a BVEC iter, we set it up 1121 * 2) all bvecs are the same in size, except potentially the 1122 * first and last bvec 1123 * 1124 * So just find our index, and adjust the iterator afterwards. 1125 * If the offset is within the first bvec (or the whole first 1126 * bvec, just use iov_iter_advance(). This makes it easier 1127 * since we can just skip the first segment, which may not 1128 * be folio_size aligned. 1129 */ 1130 const struct bio_vec *bvec = imu->bvec; 1131 1132 if (offset < bvec->bv_len) { 1133 iter->bvec = bvec; 1134 iter->count -= offset; 1135 iter->iov_offset = offset; 1136 } else { 1137 unsigned long seg_skip; 1138 1139 /* skip first vec */ 1140 offset -= bvec->bv_len; 1141 seg_skip = 1 + (offset >> imu->folio_shift); 1142 1143 iter->bvec = bvec + seg_skip; 1144 iter->nr_segs -= seg_skip; 1145 iter->count -= bvec->bv_len + offset; 1146 iter->iov_offset = offset & ~imu->folio_mask; 1147 } 1148 } 1149 1150 return 0; 1151 } 1152 1153 static int io_copy_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx) 1154 { 1155 struct io_mapped_ubuf **user_bufs; 1156 struct io_rsrc_data *data; 1157 int i, ret, nbufs; 1158 1159 /* 1160 * Drop our own lock here. We'll setup the data we need and reference 1161 * the source buffers, then re-grab, check, and assign at the end. 1162 */ 1163 mutex_unlock(&ctx->uring_lock); 1164 1165 mutex_lock(&src_ctx->uring_lock); 1166 ret = -ENXIO; 1167 nbufs = src_ctx->nr_user_bufs; 1168 if (!nbufs) 1169 goto out_unlock; 1170 ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, NULL, nbufs, &data); 1171 if (ret) 1172 goto out_unlock; 1173 1174 ret = -ENOMEM; 1175 user_bufs = kcalloc(nbufs, sizeof(*ctx->user_bufs), GFP_KERNEL); 1176 if (!user_bufs) 1177 goto out_free_data; 1178 1179 for (i = 0; i < nbufs; i++) { 1180 struct io_mapped_ubuf *src = src_ctx->user_bufs[i]; 1181 1182 refcount_inc(&src->refs); 1183 user_bufs[i] = src; 1184 } 1185 1186 /* Have a ref on the bufs now, drop src lock and re-grab our own lock */ 1187 mutex_unlock(&src_ctx->uring_lock); 1188 mutex_lock(&ctx->uring_lock); 1189 if (!ctx->user_bufs) { 1190 ctx->user_bufs = user_bufs; 1191 ctx->buf_data = data; 1192 ctx->nr_user_bufs = nbufs; 1193 return 0; 1194 } 1195 1196 /* someone raced setting up buffers, dump ours */ 1197 for (i = 0; i < nbufs; i++) 1198 io_buffer_unmap(ctx, &user_bufs[i]); 1199 io_rsrc_data_free(data); 1200 kfree(user_bufs); 1201 return -EBUSY; 1202 out_free_data: 1203 io_rsrc_data_free(data); 1204 out_unlock: 1205 mutex_unlock(&src_ctx->uring_lock); 1206 mutex_lock(&ctx->uring_lock); 1207 return ret; 1208 } 1209 1210 /* 1211 * Copy the registered buffers from the source ring whose file descriptor 1212 * is given in the src_fd to the current ring. This is identical to registering 1213 * the buffers with ctx, except faster as mappings already exist. 1214 * 1215 * Since the memory is already accounted once, don't account it again. 1216 */ 1217 int io_register_copy_buffers(struct io_ring_ctx *ctx, void __user *arg) 1218 { 1219 struct io_uring_copy_buffers buf; 1220 bool registered_src; 1221 struct file *file; 1222 int ret; 1223 1224 if (ctx->user_bufs || ctx->nr_user_bufs) 1225 return -EBUSY; 1226 if (copy_from_user(&buf, arg, sizeof(buf))) 1227 return -EFAULT; 1228 if (buf.flags & ~IORING_REGISTER_SRC_REGISTERED) 1229 return -EINVAL; 1230 if (memchr_inv(buf.pad, 0, sizeof(buf.pad))) 1231 return -EINVAL; 1232 1233 registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0; 1234 file = io_uring_register_get_file(buf.src_fd, registered_src); 1235 if (IS_ERR(file)) 1236 return PTR_ERR(file); 1237 ret = io_copy_buffers(ctx, file->private_data); 1238 if (!registered_src) 1239 fput(file); 1240 return ret; 1241 } 1242