// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "openclose.h"
#include "rsrc.h"
#include "memmap.h"
#include "register.h"

struct io_rsrc_update {
        struct file                     *file;
        u64                             arg;
        u32                             nr_args;
        u32                             offset;
};

static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
                        struct iovec *iov, struct page **last_hpage);

/* only define max */
#define IORING_MAX_FIXED_FILES  (1U << 20)
#define IORING_MAX_REG_BUFFERS  (1U << 14)

int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
        unsigned long page_limit, cur_pages, new_pages;

        if (!nr_pages)
                return 0;

        /* Don't allow more pages than we can safely lock */
        page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

        cur_pages = atomic_long_read(&user->locked_vm);
        do {
                new_pages = cur_pages + nr_pages;
                if (new_pages > page_limit)
                        return -ENOMEM;
        } while (!atomic_long_try_cmpxchg(&user->locked_vm,
                                          &cur_pages, new_pages));
        return 0;
}

static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
        if (ctx->user)
                __io_unaccount_mem(ctx->user, nr_pages);

        if (ctx->mm_account)
                atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}

static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
        int ret;

        if (ctx->user) {
                ret = __io_account_mem(ctx->user, nr_pages);
                if (ret)
                        return ret;
        }

        if (ctx->mm_account)
                atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);

        return 0;
}

static int io_buffer_validate(struct iovec *iov)
{
        unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);

        /*
         * Don't impose further limits on the size and buffer
         * constraints here, we'll -EINVAL later when IO is
         * submitted if they are wrong.
         */
        if (!iov->iov_base)
                return iov->iov_len ? -EFAULT : 0;
        if (!iov->iov_len)
                return -EFAULT;

        /* arbitrary limit, but we need something */
        if (iov->iov_len > SZ_1G)
                return -EFAULT;

        if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
                return -EOVERFLOW;

        return 0;
}
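
/*
 * Drop a reference on the buffer mapped by @node. Once the last reference
 * is gone, unpin the pages, undo the memory accounting and free the
 * io_mapped_ubuf.
 */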
static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
        unsigned int i;

        if (node->buf) {
                struct io_mapped_ubuf *imu = node->buf;

                if (!refcount_dec_and_test(&imu->refs))
                        return;
                for (i = 0; i < imu->nr_bvecs; i++)
                        unpin_user_page(imu->bvec[i].bv_page);
                if (imu->acct_pages)
                        io_unaccount_mem(ctx, imu->acct_pages);
                kvfree(imu);
        }
}

struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
{
        struct io_rsrc_node *node;

        node = kzalloc(sizeof(*node), GFP_KERNEL);
        if (node) {
                node->type = type;
                node->refs = 1;
        }
        return node;
}

__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data)
{
        if (!data->nr)
                return;
        while (data->nr--) {
                if (data->nodes[data->nr])
                        io_put_rsrc_node(ctx, data->nodes[data->nr]);
        }
        kvfree(data->nodes);
        data->nodes = NULL;
        data->nr = 0;
}

__cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
{
        data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *),
                                     GFP_KERNEL_ACCOUNT | __GFP_ZERO);
        if (data->nodes) {
                data->nr = nr;
                return 0;
        }
        return -ENOMEM;
}

static int __io_sqe_files_update(struct io_ring_ctx *ctx,
                                 struct io_uring_rsrc_update2 *up,
                                 unsigned nr_args)
{
        u64 __user *tags = u64_to_user_ptr(up->tags);
        __s32 __user *fds = u64_to_user_ptr(up->data);
        int fd, i, err = 0;
        unsigned int done;

        if (!ctx->file_table.data.nr)
                return -ENXIO;
        if (up->offset + nr_args > ctx->file_table.data.nr)
                return -EINVAL;

        for (done = 0; done < nr_args; done++) {
                u64 tag = 0;

                if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
                    copy_from_user(&fd, &fds[done], sizeof(fd))) {
                        err = -EFAULT;
                        break;
                }
                if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
                        err = -EINVAL;
                        break;
                }
                if (fd == IORING_REGISTER_FILES_SKIP)
                        continue;

                i = up->offset + done;
                if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i))
                        io_file_bitmap_clear(&ctx->file_table, i);

                if (fd != -1) {
                        struct file *file = fget(fd);
                        struct io_rsrc_node *node;

                        if (!file) {
                                err = -EBADF;
                                break;
                        }
                        /*
                         * Don't allow io_uring instances to be registered.
                         */
                        if (io_is_uring_fops(file)) {
                                fput(file);
                                err = -EBADF;
                                break;
                        }
                        node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
                        if (!node) {
                                err = -ENOMEM;
                                fput(file);
                                break;
                        }
                        ctx->file_table.data.nodes[i] = node;
                        if (tag)
                                node->tag = tag;
                        io_fixed_file_set(node, file);
                        io_file_bitmap_set(&ctx->file_table, i);
                }
        }
        return done ? done : err;
}
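
/*
 * Update already registered buffers: for each entry, copy in an iovec (and
 * optional tag) from userspace, register the new buffer and replace the
 * node at the corresponding offset in the buffer table.
 */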
static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
                                   struct io_uring_rsrc_update2 *up,
                                   unsigned int nr_args)
{
        u64 __user *tags = u64_to_user_ptr(up->tags);
        struct iovec fast_iov, *iov;
        struct page *last_hpage = NULL;
        struct iovec __user *uvec;
        u64 user_data = up->data;
        __u32 done;
        int i, err;

        if (!ctx->buf_table.nr)
                return -ENXIO;
        if (up->offset + nr_args > ctx->buf_table.nr)
                return -EINVAL;

        for (done = 0; done < nr_args; done++) {
                struct io_rsrc_node *node;
                u64 tag = 0;

                uvec = u64_to_user_ptr(user_data);
                iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
                if (IS_ERR(iov)) {
                        err = PTR_ERR(iov);
                        break;
                }
                if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
                        err = -EFAULT;
                        break;
                }
                err = io_buffer_validate(iov);
                if (err)
                        break;
                node = io_sqe_buffer_register(ctx, iov, &last_hpage);
                if (IS_ERR(node)) {
                        err = PTR_ERR(node);
                        break;
                }
                if (tag) {
                        if (!node) {
                                err = -EINVAL;
                                break;
                        }
                        node->tag = tag;
                }
                i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
                io_reset_rsrc_node(ctx, &ctx->buf_table, i);
                ctx->buf_table.nodes[i] = node;
                if (ctx->compat)
                        user_data += sizeof(struct compat_iovec);
                else
                        user_data += sizeof(struct iovec);
        }
        return done ? done : err;
}

static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
                                     struct io_uring_rsrc_update2 *up,
                                     unsigned nr_args)
{
        __u32 tmp;

        lockdep_assert_held(&ctx->uring_lock);

        if (check_add_overflow(up->offset, nr_args, &tmp))
                return -EOVERFLOW;

        switch (type) {
        case IORING_RSRC_FILE:
                return __io_sqe_files_update(ctx, up, nr_args);
        case IORING_RSRC_BUFFER:
                return __io_sqe_buffers_update(ctx, up, nr_args);
        }
        return -EINVAL;
}

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
                             unsigned nr_args)
{
        struct io_uring_rsrc_update2 up;

        if (!nr_args)
                return -EINVAL;
        memset(&up, 0, sizeof(up));
        if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
                return -EFAULT;
        if (up.resv || up.resv2)
                return -EINVAL;
        return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
                            unsigned size, unsigned type)
{
        struct io_uring_rsrc_update2 up;

        if (size != sizeof(up))
                return -EINVAL;
        if (copy_from_user(&up, arg, sizeof(up)))
                return -EFAULT;
        if (!up.nr || up.resv || up.resv2)
                return -EINVAL;
        return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
                            unsigned int size, unsigned int type)
{
        struct io_uring_rsrc_register rr;

        /* keep it extendible */
        if (size != sizeof(rr))
                return -EINVAL;

        memset(&rr, 0, sizeof(rr));
        if (copy_from_user(&rr, arg, size))
                return -EFAULT;
        if (!rr.nr || rr.resv2)
                return -EINVAL;
        if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
                return -EINVAL;

        switch (type) {
        case IORING_RSRC_FILE:
                if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
                        break;
                return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
                                             rr.nr, u64_to_user_ptr(rr.tags));
        case IORING_RSRC_BUFFER:
                if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
                        break;
                return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
                                               rr.nr, u64_to_user_ptr(rr.tags));
        }
        return -EINVAL;
}
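
/*
 * Prep for IORING_OP_FILES_UPDATE: stash the offset, count and user pointer
 * to the fd array from the SQE.
 */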
int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

        if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
                return -EINVAL;
        if (sqe->rw_flags || sqe->splice_fd_in)
                return -EINVAL;

        up->offset = READ_ONCE(sqe->off);
        up->nr_args = READ_ONCE(sqe->len);
        if (!up->nr_args)
                return -EINVAL;
        up->arg = READ_ONCE(sqe->addr);
        return 0;
}

static int io_files_update_with_index_alloc(struct io_kiocb *req,
                                            unsigned int issue_flags)
{
        struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
        __s32 __user *fds = u64_to_user_ptr(up->arg);
        unsigned int done;
        struct file *file;
        int ret, fd;

        if (!req->ctx->file_table.data.nr)
                return -ENXIO;

        for (done = 0; done < up->nr_args; done++) {
                if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
                        ret = -EFAULT;
                        break;
                }

                file = fget(fd);
                if (!file) {
                        ret = -EBADF;
                        break;
                }
                ret = io_fixed_fd_install(req, issue_flags, file,
                                          IORING_FILE_INDEX_ALLOC);
                if (ret < 0)
                        break;
                if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
                        __io_close_fixed(req->ctx, issue_flags, ret);
                        ret = -EFAULT;
                        break;
                }
        }

        if (done)
                return done;
        return ret;
}

int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
        struct io_ring_ctx *ctx = req->ctx;
        struct io_uring_rsrc_update2 up2;
        int ret;

        up2.offset = up->offset;
        up2.data = up->arg;
        up2.nr = 0;
        up2.tags = 0;
        up2.resv = 0;
        up2.resv2 = 0;

        if (up->offset == IORING_FILE_INDEX_ALLOC) {
                ret = io_files_update_with_index_alloc(req, issue_flags);
        } else {
                io_ring_submit_lock(ctx, issue_flags);
                ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
                                                &up2, up->nr_args);
                io_ring_submit_unlock(ctx, issue_flags);
        }

        if (ret < 0)
                req_set_fail(req);
        io_req_set_res(req, ret, 0);
        return IOU_OK;
}

void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
        lockdep_assert_held(&ctx->uring_lock);

        if (node->tag)
                io_post_aux_cqe(ctx, node->tag, 0, 0);

        switch (node->type) {
        case IORING_RSRC_FILE:
                if (io_slot_file(node))
                        fput(io_slot_file(node));
                break;
        case IORING_RSRC_BUFFER:
                if (node->buf)
                        io_buffer_unmap(ctx, node);
                break;
        default:
                WARN_ON_ONCE(1);
                break;
        }

        kfree(node);
}

int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
        if (!ctx->file_table.data.nr)
                return -ENXIO;

        io_free_file_tables(ctx, &ctx->file_table);
        io_file_table_set_alloc_range(ctx, 0, 0);
        return 0;
}
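
/*
 * Register an array of file descriptors (with optional tags) as fixed files.
 * A NULL array or an fd of -1 leaves the slot empty, so sparse sets can be
 * registered up front and filled in later.
 */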
int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
                          unsigned nr_args, u64 __user *tags)
{
        __s32 __user *fds = (__s32 __user *) arg;
        struct file *file;
        int fd, ret;
        unsigned i;

        if (ctx->file_table.data.nr)
                return -EBUSY;
        if (!nr_args)
                return -EINVAL;
        if (nr_args > IORING_MAX_FIXED_FILES)
                return -EMFILE;
        if (nr_args > rlimit(RLIMIT_NOFILE))
                return -EMFILE;
        if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args))
                return -ENOMEM;

        for (i = 0; i < nr_args; i++) {
                struct io_rsrc_node *node;
                u64 tag = 0;

                ret = -EFAULT;
                if (tags && copy_from_user(&tag, &tags[i], sizeof(tag)))
                        goto fail;
                if (fds && copy_from_user(&fd, &fds[i], sizeof(fd)))
                        goto fail;
                /* allow sparse sets */
                if (!fds || fd == -1) {
                        ret = -EINVAL;
                        if (tag)
                                goto fail;
                        continue;
                }

                file = fget(fd);
                ret = -EBADF;
                if (unlikely(!file))
                        goto fail;

                /*
                 * Don't allow io_uring instances to be registered.
                 */
                if (io_is_uring_fops(file)) {
                        fput(file);
                        goto fail;
                }
                ret = -ENOMEM;
                node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
                if (!node) {
                        fput(file);
                        goto fail;
                }
                if (tag)
                        node->tag = tag;
                ctx->file_table.data.nodes[i] = node;
                io_fixed_file_set(node, file);
                io_file_bitmap_set(&ctx->file_table, i);
        }

        /* default it to the whole table */
        io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
        return 0;
fail:
        io_sqe_files_unregister(ctx);
        return ret;
}

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
        if (!ctx->buf_table.nr)
                return -ENXIO;
        io_rsrc_data_free(ctx, &ctx->buf_table);
        return 0;
}

/*
 * Not super efficient, but this is just registration time. And we do cache
 * the last compound head, so generally we'll only do a full search if we
 * don't match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
                                  int nr_pages, struct page *hpage)
{
        int i, j;

        /* check current page array */
        for (i = 0; i < nr_pages; i++) {
                if (!PageCompound(pages[i]))
                        continue;
                if (compound_head(pages[i]) == hpage)
                        return true;
        }

        /* check previously registered pages */
        for (i = 0; i < ctx->buf_table.nr; i++) {
                struct io_rsrc_node *node = ctx->buf_table.nodes[i];
                struct io_mapped_ubuf *imu;

                if (!node)
                        continue;
                imu = node->buf;
                for (j = 0; j < imu->nr_bvecs; j++) {
                        if (!PageCompound(imu->bvec[j].bv_page))
                                continue;
                        if (compound_head(imu->bvec[j].bv_page) == hpage)
                                return true;
                }
        }

        return false;
}
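
/*
 * Figure out how many pages to account against RLIMIT_MEMLOCK for this
 * buffer. Compound (huge) pages are accounted at their full size, but only
 * once, even if several registered buffers cover the same compound page.
 */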
static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
                                 int nr_pages, struct io_mapped_ubuf *imu,
                                 struct page **last_hpage)
{
        int i, ret;

        imu->acct_pages = 0;
        for (i = 0; i < nr_pages; i++) {
                if (!PageCompound(pages[i])) {
                        imu->acct_pages++;
                } else {
                        struct page *hpage;

                        hpage = compound_head(pages[i]);
                        if (hpage == *last_hpage)
                                continue;
                        *last_hpage = hpage;
                        if (headpage_already_acct(ctx, pages, i, hpage))
                                continue;
                        imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
                }
        }

        if (!imu->acct_pages)
                return 0;

        ret = io_account_mem(ctx, imu->acct_pages);
        if (ret)
                imu->acct_pages = 0;
        return ret;
}

static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages,
                                  struct io_imu_folio_data *data, int nr_folios)
{
        struct page **page_array = *pages, **new_array = NULL;
        int nr_pages_left = *nr_pages, i, j;

        /* Store head pages only */
        new_array = kvmalloc_array(nr_folios, sizeof(struct page *),
                                   GFP_KERNEL);
        if (!new_array)
                return false;

        new_array[0] = compound_head(page_array[0]);
        /*
         * The pages are bound to the folio; this doesn't actually unpin
         * them but drops all but one reference, which is usually put down
         * by io_buffer_unmap(). Note, needs a better helper.
         */
        if (data->nr_pages_head > 1)
                unpin_user_pages(&page_array[1], data->nr_pages_head - 1);

        j = data->nr_pages_head;
        nr_pages_left -= data->nr_pages_head;
        for (i = 1; i < nr_folios; i++) {
                unsigned int nr_unpin;

                new_array[i] = page_array[j];
                nr_unpin = min_t(unsigned int, nr_pages_left - 1,
                                 data->nr_pages_mid - 1);
                if (nr_unpin)
                        unpin_user_pages(&page_array[j+1], nr_unpin);
                j += data->nr_pages_mid;
                nr_pages_left -= data->nr_pages_mid;
        }
        kvfree(page_array);
        *pages = new_array;
        *nr_pages = nr_folios;
        return true;
}

static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
                                   struct io_imu_folio_data *data)
{
        struct page **page_array = *pages;
        struct folio *folio = page_folio(page_array[0]);
        unsigned int count = 1, nr_folios = 1;
        int i;

        if (*nr_pages <= 1)
                return false;

        data->nr_pages_mid = folio_nr_pages(folio);
        if (data->nr_pages_mid == 1)
                return false;

        data->folio_shift = folio_shift(folio);
        /*
         * Check if pages are contiguous inside a folio, and all folios have
         * the same page count except for the head and tail.
         */
        for (i = 1; i < *nr_pages; i++) {
                if (page_folio(page_array[i]) == folio &&
                    page_array[i] == page_array[i-1] + 1) {
                        count++;
                        continue;
                }

                if (nr_folios == 1) {
                        if (folio_page_idx(folio, page_array[i-1]) !=
                            data->nr_pages_mid - 1)
                                return false;

                        data->nr_pages_head = count;
                } else if (count != data->nr_pages_mid) {
                        return false;
                }

                folio = page_folio(page_array[i]);
                if (folio_size(folio) != (1UL << data->folio_shift) ||
                    folio_page_idx(folio, page_array[i]) != 0)
                        return false;

                count = 1;
                nr_folios++;
        }
        if (nr_folios == 1)
                data->nr_pages_head = count;

        return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios);
}
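
/*
 * Pin down the pages backing a single iovec, coalesce huge page runs into
 * fewer bvec entries where possible, account the pinned memory and return
 * a resource node describing the mapped buffer.
 */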
static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
                                                   struct iovec *iov,
                                                   struct page **last_hpage)
{
        struct io_mapped_ubuf *imu = NULL;
        struct page **pages = NULL;
        struct io_rsrc_node *node;
        unsigned long off;
        size_t size;
        int ret, nr_pages, i;
        struct io_imu_folio_data data;
        bool coalesced;

        if (!iov->iov_base)
                return NULL;

        node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
        if (!node)
                return ERR_PTR(-ENOMEM);
        node->buf = NULL;

        ret = -ENOMEM;
        pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
                             &nr_pages);
        if (IS_ERR(pages)) {
                ret = PTR_ERR(pages);
                pages = NULL;
                goto done;
        }

        /* If it's huge page(s), try to coalesce them into fewer bvec entries */
        coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data);

        imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
        if (!imu)
                goto done;

        ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
        if (ret) {
                unpin_user_pages(pages, nr_pages);
                goto done;
        }

        size = iov->iov_len;
        /* store original address for later verification */
        imu->ubuf = (unsigned long) iov->iov_base;
        imu->len = iov->iov_len;
        imu->nr_bvecs = nr_pages;
        imu->folio_shift = PAGE_SHIFT;
        if (coalesced)
                imu->folio_shift = data.folio_shift;
        refcount_set(&imu->refs, 1);
        off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1);
        node->buf = imu;
        ret = 0;

        for (i = 0; i < nr_pages; i++) {
                size_t vec_len;

                vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
                bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
                off = 0;
                size -= vec_len;
        }
done:
        if (ret) {
                kvfree(imu);
                if (node)
                        io_put_rsrc_node(ctx, node);
                node = ERR_PTR(ret);
        }
        kvfree(pages);
        return node;
}

int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
                            unsigned int nr_args, u64 __user *tags)
{
        struct page *last_hpage = NULL;
        struct io_rsrc_data data;
        struct iovec fast_iov, *iov = &fast_iov;
        const struct iovec __user *uvec;
        int i, ret;

        BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

        if (ctx->buf_table.nr)
                return -EBUSY;
        if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
                return -EINVAL;
        ret = io_rsrc_data_alloc(&data, nr_args);
        if (ret)
                return ret;

        if (!arg)
                memset(iov, 0, sizeof(*iov));

        for (i = 0; i < nr_args; i++) {
                struct io_rsrc_node *node;
                u64 tag = 0;

                if (arg) {
                        uvec = (struct iovec __user *) arg;
                        iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
                        if (IS_ERR(iov)) {
                                ret = PTR_ERR(iov);
                                break;
                        }
                        ret = io_buffer_validate(iov);
                        if (ret)
                                break;
                        if (ctx->compat)
                                arg += sizeof(struct compat_iovec);
                        else
                                arg += sizeof(struct iovec);
                }

                if (tags) {
                        if (copy_from_user(&tag, &tags[i], sizeof(tag))) {
                                ret = -EFAULT;
                                break;
                        }
                }

                node = io_sqe_buffer_register(ctx, iov, &last_hpage);
                if (IS_ERR(node)) {
                        ret = PTR_ERR(node);
                        break;
                }
                if (tag) {
                        if (!node) {
                                ret = -EINVAL;
                                break;
                        }
                        node->tag = tag;
                }
                data.nodes[i] = node;
        }

        ctx->buf_table = data;
        if (ret)
                io_sqe_buffers_unregister(ctx);
        return ret;
}
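
/*
 * Set up @iter to cover [buf_addr, buf_addr + len) within a registered
 * buffer, reusing the bvec table built at registration time.
 */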
int io_import_fixed(int ddir, struct iov_iter *iter,
                    struct io_mapped_ubuf *imu,
                    u64 buf_addr, size_t len)
{
        u64 buf_end;
        size_t offset;

        if (WARN_ON_ONCE(!imu))
                return -EFAULT;
        if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
                return -EFAULT;
        /* not inside the mapped region */
        if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
                return -EFAULT;

        /*
         * Might not be a start of buffer, set size appropriately
         * and advance us to the beginning.
         */
        offset = buf_addr - imu->ubuf;
        iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);

        if (offset) {
                /*
                 * Don't use iov_iter_advance() here, as it's really slow for
                 * using the latter parts of a big fixed buffer - it iterates
                 * over each segment manually. We can cheat a bit here, because
                 * we know that:
                 *
                 * 1) it's a BVEC iter, we set it up
                 * 2) all bvecs are the same size, except potentially the
                 *    first and last bvec
                 *
                 * So just find our index, and adjust the iterator afterwards.
                 * If the offset is within the first bvec (or is the whole
                 * first bvec), just use iov_iter_advance(). This makes it
                 * easier since we can just skip the first segment, which may
                 * not be folio_size aligned.
                 */
                const struct bio_vec *bvec = imu->bvec;

                if (offset < bvec->bv_len) {
                        iter->count -= offset;
                        iter->iov_offset = offset;
                } else {
                        unsigned long seg_skip;

                        /* skip first vec */
                        offset -= bvec->bv_len;
                        seg_skip = 1 + (offset >> imu->folio_shift);

                        iter->bvec += seg_skip;
                        iter->nr_segs -= seg_skip;
                        iter->count -= bvec->bv_len + offset;
                        iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1);
                }
        }

        return 0;
}
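
/*
 * Clone registered buffers from @src_ctx into @ctx. Each ring has its own
 * uring_lock, and they are never held at the same time here: drop our lock
 * while referencing the source buffers, then re-take it to install the new
 * table, failing with -EBUSY if someone else registered buffers meanwhile.
 */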
static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
                            struct io_uring_clone_buffers *arg)
{
        struct io_rsrc_data data;
        int i, ret, off, nr;
        unsigned int nbufs;

        /* if offsets are given, must have nr specified too */
        if (!arg->nr && (arg->dst_off || arg->src_off))
                return -EINVAL;
        /* not allowed unless REPLACE is set */
        if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
                return -EBUSY;

        nbufs = READ_ONCE(src_ctx->buf_table.nr);
        if (!arg->nr)
                arg->nr = nbufs;
        else if (arg->nr > nbufs)
                return -EINVAL;
        else if (arg->nr > IORING_MAX_REG_BUFFERS)
                return -EINVAL;
        if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
                return -EOVERFLOW;

        ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr));
        if (ret)
                return ret;

        /* Fill entries in data from dst that won't overlap with src */
        for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
                struct io_rsrc_node *src_node = ctx->buf_table.nodes[i];

                if (src_node) {
                        data.nodes[i] = src_node;
                        src_node->refs++;
                }
        }

        /*
         * Drop our own lock here. We'll setup the data we need and reference
         * the source buffers, then re-grab, check, and assign at the end.
         */
        mutex_unlock(&ctx->uring_lock);

        mutex_lock(&src_ctx->uring_lock);
        ret = -ENXIO;
        nbufs = src_ctx->buf_table.nr;
        if (!nbufs)
                goto out_unlock;
        ret = -EINVAL;
        if (!arg->nr)
                arg->nr = nbufs;
        else if (arg->nr > nbufs)
                goto out_unlock;
        ret = -EOVERFLOW;
        if (check_add_overflow(arg->nr, arg->src_off, &off))
                goto out_unlock;
        if (off > nbufs)
                goto out_unlock;

        off = arg->dst_off;
        i = arg->src_off;
        nr = arg->nr;
        while (nr--) {
                struct io_rsrc_node *dst_node, *src_node;

                src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i);
                if (!src_node) {
                        dst_node = NULL;
                } else {
                        dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
                        if (!dst_node) {
                                ret = -ENOMEM;
                                goto out_put_free;
                        }

                        refcount_inc(&src_node->buf->refs);
                        dst_node->buf = src_node->buf;
                }
                data.nodes[off++] = dst_node;
                i++;
        }

        /* Have a ref on the bufs now, drop src lock and re-grab our own lock */
        mutex_unlock(&src_ctx->uring_lock);
        mutex_lock(&ctx->uring_lock);

        /*
         * If asked for replace, put the old table. data->nodes[] holds both
         * old and new nodes at this point.
         */
        if (arg->flags & IORING_REGISTER_DST_REPLACE)
                io_rsrc_data_free(ctx, &ctx->buf_table);

        /*
         * ctx->buf_table should be empty now - either the contents are being
         * replaced and we just freed the table, or someone raced setting up
         * a buffer table while the clone was happening. If not empty, fall
         * through to failure handling.
         */
        if (!ctx->buf_table.nr) {
                ctx->buf_table = data;
                return 0;
        }

        mutex_unlock(&ctx->uring_lock);
        mutex_lock(&src_ctx->uring_lock);
        /* someone raced setting up buffers, dump ours */
        ret = -EBUSY;
out_put_free:
        i = data.nr;
        while (i--) {
                io_buffer_unmap(src_ctx, data.nodes[i]);
                kfree(data.nodes[i]);
        }
out_unlock:
        io_rsrc_data_free(ctx, &data);
        mutex_unlock(&src_ctx->uring_lock);
        mutex_lock(&ctx->uring_lock);
        return ret;
}

/*
 * Copy the registered buffers from the source ring whose file descriptor
 * is given in the src_fd to the current ring. This is identical to registering
 * the buffers with ctx, except faster as mappings already exist.
 *
 * Since the memory is already accounted once, don't account it again.
 */
int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
{
        struct io_uring_clone_buffers buf;
        bool registered_src;
        struct file *file;
        int ret;

        if (copy_from_user(&buf, arg, sizeof(buf)))
                return -EFAULT;
        if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE))
                return -EINVAL;
        if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr)
                return -EBUSY;
        if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
                return -EINVAL;

        registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
        file = io_uring_register_get_file(buf.src_fd, registered_src);
        if (IS_ERR(file))
                return PTR_ERR(file);
        ret = io_clone_buffers(ctx, file->private_data, &buf);
        if (!registered_src)
                fput(file);
        return ret;
}