// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring/cmd.h>

#include <uapi/linux/io_uring.h>

#include "filetable.h"
#include "io_uring.h"
#include "openclose.h"
#include "rsrc.h"
#include "memmap.h"
#include "register.h"

struct io_rsrc_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
};

static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
			struct iovec *iov, struct page **last_hpage);

/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)

#define IO_CACHED_BVECS_SEGS	32

int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
	return 0;
}

void io_unaccount_mem(struct user_struct *user, struct mm_struct *mm_account,
		      unsigned long nr_pages)
{
	if (user)
		__io_unaccount_mem(user, nr_pages);

	if (mm_account)
		atomic64_sub(nr_pages, &mm_account->pinned_vm);
}

int io_account_mem(struct user_struct *user, struct mm_struct *mm_account,
		   unsigned long nr_pages)
{
	int ret;

	if (user) {
		ret = __io_account_mem(user, nr_pages);
		if (ret)
			return ret;
	}

	if (mm_account)
		atomic64_add(nr_pages, &mm_account->pinned_vm);

	return 0;
}

int io_validate_user_buf_range(u64 uaddr, u64 ulen)
{
	unsigned long tmp, base = (unsigned long)uaddr;
	unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen);

	/* arbitrary limit, but we need something */
	if (ulen > SZ_1G || !ulen)
		return -EFAULT;
	if (check_add_overflow(base, acct_len, &tmp))
		return -EOVERFLOW;
	return 0;
}

static void io_release_ubuf(void *priv)
{
	struct io_mapped_ubuf *imu = priv;
	unsigned int i;

	for (i = 0; i < imu->nr_bvecs; i++) {
		struct folio *folio = page_folio(imu->bvec[i].bv_page);

		unpin_user_folio(folio, 1);
	}
}

static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
					   int nr_bvecs)
{
	if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
		return io_cache_alloc(&ctx->imu_cache, GFP_KERNEL);
	return kvmalloc_flex(struct io_mapped_ubuf, bvec, nr_bvecs);
}

static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
	if (imu->nr_bvecs <= IO_CACHED_BVECS_SEGS)
		io_cache_free(&ctx->imu_cache, imu);
	else
		kvfree(imu);
}

static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
	if (unlikely(refcount_read(&imu->refs) > 1)) {
		if (!refcount_dec_and_test(&imu->refs))
			return;
	}

	if (imu->acct_pages)
		io_unaccount_mem(ctx->user, ctx->mm_account, imu->acct_pages);
	imu->release(imu->priv);
	io_free_imu(ctx, imu);
}

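/*
 * A brief sketch of resource-node lifetime, inferred from the code below
 * rather than separate documentation: io_rsrc_node_alloc() hands out a
 * node with refs == 1, table slots and in-flight requests take additional
 * references, and io_free_rsrc_node() runs once the last reference is
 * dropped, posting the user-supplied tag as a CQE if one was attached.
 */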
struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
{
	struct io_rsrc_node *node;

	node = io_cache_alloc(&ctx->node_cache, GFP_KERNEL);
	if (node) {
		node->type = type;
		node->refs = 1;
		node->tag = 0;
		node->file_ptr = 0;
	}
	return node;
}

bool io_rsrc_cache_init(struct io_ring_ctx *ctx)
{
	const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec,
						 IO_CACHED_BVECS_SEGS);
	const int node_size = sizeof(struct io_rsrc_node);
	bool ret;

	ret = io_alloc_cache_init(&ctx->node_cache, IO_ALLOC_CACHE_MAX,
				  node_size, 0);
	ret |= io_alloc_cache_init(&ctx->imu_cache, IO_ALLOC_CACHE_MAX,
				   imu_cache_size, 0);
	return ret;
}

void io_rsrc_cache_free(struct io_ring_ctx *ctx)
{
	io_alloc_cache_free(&ctx->node_cache, kfree);
	io_alloc_cache_free(&ctx->imu_cache, kvfree);
}

static void io_clear_table_tags(struct io_rsrc_data *data)
{
	int i;

	for (i = 0; i < data->nr; i++) {
		struct io_rsrc_node *node = data->nodes[i];

		if (node)
			node->tag = 0;
	}
}

__cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
			      struct io_rsrc_data *data)
{
	if (!data->nr)
		return;
	while (data->nr--) {
		if (data->nodes[data->nr])
			io_put_rsrc_node(ctx, data->nodes[data->nr]);
	}
	kvfree(data->nodes);
	data->nodes = NULL;
	data->nr = 0;
}

__cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
{
	data->nodes = kvmalloc_objs(struct io_rsrc_node *, nr,
				    GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (data->nodes) {
		data->nr = nr;
		return 0;
	}
	return -ENOMEM;
}

static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	int fd, i, err = 0;
	unsigned int done;

	if (!ctx->file_table.data.nr)
		return -ENXIO;
	if (up->offset + nr_args > ctx->file_table.data.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = up->offset + done;
		if (i >= ctx->file_table.data.nr)
			break;
		i = array_index_nospec(i, ctx->file_table.data.nr);
		if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i))
			io_file_bitmap_clear(&ctx->file_table, i);

		if (fd != -1) {
			struct file *file = fget(fd);
			struct io_rsrc_node *node;

			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
			if (!node) {
				err = -ENOMEM;
				fput(file);
				break;
			}
			ctx->file_table.data.nodes[i] = node;
			if (tag)
				node->tag = tag;
			io_fixed_file_set(node, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}
	return done ? done : err;
}

static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec fast_iov, *iov;
	struct page *last_hpage = NULL;
	struct iovec __user *uvec;
	u64 user_data = up->data;
	__u32 done;
	int i, err;

	if (!ctx->buf_table.nr)
		return -ENXIO;
	if (up->offset + nr_args > ctx->buf_table.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		uvec = u64_to_user_ptr(user_data);
		iov = iovec_from_user(uvec, 1, 1, &fast_iov, io_is_compat(ctx));
		if (IS_ERR(iov)) {
			err = PTR_ERR(iov);
			break;
		}
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			err = PTR_ERR(node);
			break;
		}
		if (tag) {
			if (!node) {
				err = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
		io_reset_rsrc_node(ctx, &ctx->buf_table, i);
		ctx->buf_table.nodes[i] = node;
		if (io_is_compat(ctx))
			user_data += sizeof(struct compat_iovec);
		else
			user_data += sizeof(struct iovec);
	}
	return done ? done : err;
}

static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args)
{
	__u32 tmp;

	lockdep_assert_held(&ctx->uring_lock);

	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

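/*
 * Rough userspace sketch of the registration path handled below
 * (illustrative only, shown as a direct io_uring_register(2) call;
 * liburing provides wrappers for the same thing):
 *
 *	struct io_uring_rsrc_register rr = {
 *		.nr	= 8,
 *		.flags	= IORING_RSRC_REGISTER_SPARSE,
 *	};
 *	io_uring_register(ring_fd, IORING_REGISTER_FILES2, &rr, sizeof(rr));
 *
 * A sparse registration like this reserves 8 file slots without installing
 * any files; individual slots are filled later via the update paths above.
 */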
__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}

int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	return 0;
}

static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_table.data.nr)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (get_user(fd, &fds[done])) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (put_user(ret, &fds[done])) {
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}

int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;
	int ret;

	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
						&up2, up->nr_args);
		io_ring_submit_unlock(ctx, issue_flags);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_COMPLETE;
}

void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
	if (node->tag)
		io_post_aux_cqe(ctx, node->tag, 0, 0);

	switch (node->type) {
	case IORING_RSRC_FILE:
		fput(io_slot_file(node));
		break;
	case IORING_RSRC_BUFFER:
		io_buffer_unmap(ctx, node->buf);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}

	io_cache_free(&ctx->node_cache, node);
}

int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	if (!ctx->file_table.data.nr)
		return -ENXIO;

	io_free_file_tables(ctx, &ctx->file_table);
	io_file_table_set_alloc_range(ctx, 0, 0);
	return 0;
}

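/*
 * Initial registration of a fixed file table. An fd of -1 leaves that slot
 * empty (sparse), which is why a tag on such a slot is rejected below.
 * Illustrative userspace call (raw syscall form, array contents are only
 * an example):
 *
 *	int fds[4] = { sock_fd, -1, -1, file_fd };
 *	io_uring_register(ring_fd, IORING_REGISTER_FILES, fds, 4);
 */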
int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_table.data.nr)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args))
		return -ENOMEM;

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		ret = -EFAULT;
		if (tags && copy_from_user(&tag, &tags[i], sizeof(tag)))
			goto fail;
		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd)))
			goto fail;
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (tag)
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		ret = -ENOMEM;
		node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
		if (!node) {
			fput(file);
			goto fail;
		}
		if (tag)
			node->tag = tag;
		ctx->file_table.data.nodes[i] = node;
		io_fixed_file_set(node, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
	return 0;
fail:
	io_clear_table_tags(&ctx->file_table.data);
	io_sqe_files_unregister(ctx);
	return ret;
}

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	if (!ctx->buf_table.nr)
		return -ENXIO;
	io_rsrc_data_free(ctx, &ctx->buf_table);
	return 0;
}

/*
 * Not super efficient, but this only happens at registration time. And we
 * do cache the last compound head, so generally we'll only do a full search
 * if we don't match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->buf_table.nr; i++) {
		struct io_rsrc_node *node = ctx->buf_table.nodes[i];
		struct io_mapped_ubuf *imu;

		if (!node)
			continue;
		imu = node->buf;
		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}

static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx->user, ctx->mm_account, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}

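/*
 * Worked example for the accounting above (numbers are illustrative): a
 * 1MiB user buffer sitting inside a single 2MiB THP is accounted as the
 * full huge page, i.e. 512 base pages, but only once, even if several
 * registered buffers share that same compound page. Two regular 4KiB
 * pages are accounted as exactly 2 pages.
 */
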
static bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
			       struct io_imu_folio_data *data)
{
	struct page **page_array = *pages, **new_array = NULL;
	unsigned nr_pages_left = *nr_pages;
	unsigned nr_folios = data->nr_folios;
	unsigned i, j;

	/* Store head pages only */
	new_array = kvmalloc_objs(struct page *, nr_folios);
	if (!new_array)
		return false;

	for (i = 0, j = 0; i < nr_folios; i++) {
		struct page *p = compound_head(page_array[j]);
		struct folio *folio = page_folio(p);
		unsigned int nr;

		WARN_ON_ONCE(i > 0 && p != page_array[j]);

		nr = i ? data->nr_pages_mid : data->nr_pages_head;
		nr = min(nr, nr_pages_left);
		/* Drop all but one ref, the entire folio will remain pinned. */
		if (nr > 1)
			unpin_user_folio(folio, nr - 1);
		j += nr;
		nr_pages_left -= nr;
		new_array[i] = p;
	}

	WARN_ON_ONCE(j != *nr_pages);

	kvfree(page_array);
	*pages = new_array;
	*nr_pages = nr_folios;
	return true;
}

bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
			      struct io_imu_folio_data *data)
{
	struct folio *folio = page_folio(page_array[0]);
	unsigned int count = 1, nr_folios = 1;
	int i;

	data->nr_pages_mid = folio_nr_pages(folio);
	data->folio_shift = folio_shift(folio);
	data->first_folio_page_idx = folio_page_idx(folio, page_array[0]);

	/*
	 * Check if pages are contiguous inside a folio, and all folios have
	 * the same page count except for the head and tail.
	 */
	for (i = 1; i < nr_pages; i++) {
		if (page_folio(page_array[i]) == folio &&
		    page_array[i] == page_array[i-1] + 1) {
			count++;
			continue;
		}

		if (nr_folios == 1) {
			if (folio_page_idx(folio, page_array[i-1]) !=
			    data->nr_pages_mid - 1)
				return false;

			data->nr_pages_head = count;
		} else if (count != data->nr_pages_mid) {
			return false;
		}

		folio = page_folio(page_array[i]);
		if (folio_size(folio) != (1UL << data->folio_shift) ||
		    folio_page_idx(folio, page_array[i]) != 0)
			return false;

		count = 1;
		nr_folios++;
	}
	if (nr_folios == 1)
		data->nr_pages_head = count;

	data->nr_folios = nr_folios;
	return true;
}

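/*
 * Illustrative effect of the coalescing above: a 4MiB buffer backed by two
 * 2MiB huge folios initially pins 1024 base pages, but after
 * io_check_coalesce_buffer()/io_coalesce_buffer() it is described by just
 * the two folio head pages with folio_shift == 21, so the resulting imu
 * needs 2 bvec entries instead of 1024.
 */
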
static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
						   struct iovec *iov,
						   struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	struct io_rsrc_node *node;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;
	struct io_imu_folio_data data;
	bool coalesced = false;

	if (!iov->iov_base) {
		if (iov->iov_len)
			return ERR_PTR(-EFAULT);
		/* remove the buffer without installing a new one */
		return NULL;
	}

	ret = io_validate_user_buf_range((unsigned long)iov->iov_base,
					 iov->iov_len);
	if (ret)
		return ERR_PTR(ret);

	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
	if (!node)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
			     &nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	/* If it's huge page(s), try to coalesce them into fewer bvec entries */
	if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) {
		if (data.nr_pages_mid != 1)
			coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
	}

	imu = io_alloc_imu(ctx, nr_pages);
	if (!imu)
		goto done;

	imu->nr_bvecs = nr_pages;
	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret)
		goto done;

	size = iov->iov_len;
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->len = iov->iov_len;
	imu->folio_shift = PAGE_SHIFT;
	imu->release = io_release_ubuf;
	imu->priv = imu;
	imu->flags = 0;
	imu->dir = IO_IMU_DEST | IO_IMU_SOURCE;
	if (coalesced)
		imu->folio_shift = data.folio_shift;
	refcount_set(&imu->refs, 1);

	off = (unsigned long)iov->iov_base & ~PAGE_MASK;
	if (coalesced)
		off += data.first_folio_page_idx << PAGE_SHIFT;

	node->buf = imu;
	ret = 0;

	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
		off = 0;
		size -= vec_len;
	}
done:
	if (ret) {
		if (imu)
			io_free_imu(ctx, imu);
		if (pages) {
			for (i = 0; i < nr_pages; i++)
				unpin_user_folio(page_folio(pages[i]), 1);
		}
		io_cache_free(&ctx->node_cache, node);
		node = ERR_PTR(ret);
	}
	kvfree(pages);
	return node;
}

int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data data;
	struct iovec fast_iov, *iov = &fast_iov;
	const struct iovec __user *uvec;
	int i, ret;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->buf_table.nr)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_data_alloc(&data, nr_args);
	if (ret)
		return ret;

	if (!arg)
		memset(iov, 0, sizeof(*iov));

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		if (arg) {
			uvec = (struct iovec __user *) arg;
			iov = iovec_from_user(uvec, 1, 1, &fast_iov, io_is_compat(ctx));
			if (IS_ERR(iov)) {
				ret = PTR_ERR(iov);
				break;
			}
			if (io_is_compat(ctx))
				arg += sizeof(struct compat_iovec);
			else
				arg += sizeof(struct iovec);
		}

		if (tags) {
			if (copy_from_user(&tag, &tags[i], sizeof(tag))) {
				ret = -EFAULT;
				break;
			}
		}

		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			ret = PTR_ERR(node);
			break;
		}
		if (tag) {
			if (!node) {
				ret = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		data.nodes[i] = node;
	}

	ctx->buf_table = data;
	if (ret) {
		io_clear_table_tags(&ctx->buf_table);
		io_sqe_buffers_unregister(ctx);
	}
	return ret;
}

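/*
 * The two exported helpers below let an io_uring_cmd provider (a driver
 * handling uring_cmd requests) expose the bio_vecs of a struct request as
 * a fixed buffer in this ring's buffer table, and drop it again later.
 * Such a buffer is marked IO_REGBUF_F_KBUF so the import paths treat it as
 * kernel memory rather than a pinned user mapping, and its direction is
 * limited to the data direction of the request.
 */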
int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
			    void (*release)(void *), unsigned int index,
			    unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *data = &ctx->buf_table;
	struct req_iterator rq_iter;
	struct io_mapped_ubuf *imu;
	struct io_rsrc_node *node;
	struct bio_vec bv;
	unsigned int nr_bvecs = 0;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	if (index >= data->nr) {
		ret = -EINVAL;
		goto unlock;
	}
	index = array_index_nospec(index, data->nr);

	if (data->nodes[index]) {
		ret = -EBUSY;
		goto unlock;
	}

	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
	if (!node) {
		ret = -ENOMEM;
		goto unlock;
	}

	/*
	 * blk_rq_nr_phys_segments() may overestimate the number of bvecs
	 * but avoids needing to iterate over the bvecs
	 */
	imu = io_alloc_imu(ctx, blk_rq_nr_phys_segments(rq));
	if (!imu) {
		io_cache_free(&ctx->node_cache, node);
		ret = -ENOMEM;
		goto unlock;
	}

	imu->ubuf = 0;
	imu->len = blk_rq_bytes(rq);
	imu->acct_pages = 0;
	imu->folio_shift = PAGE_SHIFT;
	refcount_set(&imu->refs, 1);
	imu->release = release;
	imu->priv = rq;
	imu->flags = IO_REGBUF_F_KBUF;
	imu->dir = 1 << rq_data_dir(rq);

	rq_for_each_bvec(bv, rq, rq_iter)
		imu->bvec[nr_bvecs++] = bv;
	imu->nr_bvecs = nr_bvecs;

	node->buf = imu;
	data->nodes[index] = node;
unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_register_bvec);

int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
			      unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *data = &ctx->buf_table;
	struct io_rsrc_node *node;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	if (index >= data->nr) {
		ret = -EINVAL;
		goto unlock;
	}
	index = array_index_nospec(index, data->nr);

	node = data->nodes[index];
	if (!node) {
		ret = -EINVAL;
		goto unlock;
	}
	if (!(node->buf->flags & IO_REGBUF_F_KBUF)) {
		ret = -EBUSY;
		goto unlock;
	}

	io_put_rsrc_node(ctx, node);
	data->nodes[index] = NULL;
unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec);

static int validate_fixed_range(u64 buf_addr, size_t len,
				const struct io_mapped_ubuf *imu)
{
	u64 buf_end;

	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
		return -EFAULT;
	if (unlikely(len > MAX_RW_COUNT))
		return -EFAULT;
	return 0;
}

static int io_import_kbuf(int ddir, struct iov_iter *iter,
			  struct io_mapped_ubuf *imu, size_t len, size_t offset)
{
	size_t count = len + offset;

	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count);
	iov_iter_advance(iter, offset);
	return 0;
}

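/*
 * Worked example for the bvec skip below (illustrative, assuming a
 * page-aligned registration with 4KiB folios): importing at offset 9000
 * into the buffer first drops bvec[0] (4096 bytes), leaving 4904;
 * seg_skip = 1 + (4904 >> 12) = 2, so the iter starts at bvec[2] with an
 * in-folio offset of 4904 & 4095 = 808. For len == 5000 that makes
 * nr_segs = (808 + 5000 + 0 + 4095) >> 12 = 2 bvec entries.
 */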
static int io_import_fixed(int ddir, struct iov_iter *iter,
			   struct io_mapped_ubuf *imu,
			   u64 buf_addr, size_t len)
{
	const struct bio_vec *bvec;
	size_t folio_mask;
	unsigned nr_segs;
	size_t offset;
	int ret;

	ret = validate_fixed_range(buf_addr, len, imu);
	if (unlikely(ret))
		return ret;
	if (!(imu->dir & (1 << ddir)))
		return -EFAULT;
	if (unlikely(!len)) {
		iov_iter_bvec(iter, ddir, NULL, 0, 0);
		return 0;
	}

	offset = buf_addr - imu->ubuf;

	if (imu->flags & IO_REGBUF_F_KBUF)
		return io_import_kbuf(ddir, iter, imu, len, offset);

	/*
	 * Don't use iov_iter_advance() here, as it's really slow for
	 * using the latter parts of a big fixed buffer - it iterates
	 * over each segment manually. We can cheat a bit here for user
	 * registered nodes, because we know that:
	 *
	 * 1) it's a BVEC iter, we set it up
	 * 2) all bvecs are the same in size, except potentially the
	 *    first and last bvec
	 */
	folio_mask = (1UL << imu->folio_shift) - 1;
	bvec = imu->bvec;
	if (offset >= bvec->bv_len) {
		unsigned long seg_skip;

		/* skip first vec */
		offset -= bvec->bv_len;
		seg_skip = 1 + (offset >> imu->folio_shift);
		bvec += seg_skip;
		offset &= folio_mask;
	}
	nr_segs = (offset + len + bvec->bv_offset + folio_mask) >> imu->folio_shift;
	iov_iter_bvec(iter, ddir, bvec, nr_segs, len);
	iter->iov_offset = offset;
	return 0;
}

inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req,
					     unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_rsrc_node *node;

	if (req->flags & REQ_F_BUF_NODE)
		return req->buf_node;
	req->flags |= REQ_F_BUF_NODE;

	io_ring_submit_lock(ctx, issue_flags);
	node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
	if (node) {
		node->refs++;
		req->buf_node = node;
		io_ring_submit_unlock(ctx, issue_flags);
		return node;
	}
	req->flags &= ~REQ_F_BUF_NODE;
	io_ring_submit_unlock(ctx, issue_flags);
	return NULL;
}

int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,
		      u64 buf_addr, size_t len, int ddir,
		      unsigned issue_flags)
{
	struct io_rsrc_node *node;

	node = io_find_buf_node(req, issue_flags);
	if (!node)
		return -EFAULT;
	return io_import_fixed(ddir, iter, node->buf, buf_addr, len);
}

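/*
 * Ordering the two uring_lock acquisitions by context pointer value gives
 * every buffer-clone operation the same global lock order, so two rings
 * cloning from each other concurrently cannot deadlock. (A sketch of the
 * rationale, not a locking rule documented elsewhere.)
 */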
/* Lock two rings at once. The rings must be different! */
static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
{
	if (ctx1 > ctx2)
		swap(ctx1, ctx2);
	mutex_lock(&ctx1->uring_lock);
	mutex_lock_nested(&ctx2->uring_lock, SINGLE_DEPTH_NESTING);
}

/* Both rings are locked by the caller. */
static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
			    struct io_uring_clone_buffers *arg)
{
	struct io_rsrc_data data;
	int i, ret, off, nr;
	unsigned int nbufs;

	lockdep_assert_held(&ctx->uring_lock);
	lockdep_assert_held(&src_ctx->uring_lock);

	/*
	 * Accounting state is shared between the two rings; that only works if
	 * both rings are accounted towards the same counters.
	 */
	if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account)
		return -EINVAL;

	/* if offsets are given, must have nr specified too */
	if (!arg->nr && (arg->dst_off || arg->src_off))
		return -EINVAL;
	/* not allowed unless REPLACE is set */
	if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
		return -EBUSY;

	nbufs = src_ctx->buf_table.nr;
	if (!nbufs)
		return -ENXIO;
	if (!arg->nr)
		arg->nr = nbufs;
	else if (arg->nr > nbufs)
		return -EINVAL;
	else if (arg->nr > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	if (check_add_overflow(arg->nr, arg->src_off, &off) || off > nbufs)
		return -EOVERFLOW;
	if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
		return -EOVERFLOW;
	if (nbufs > IORING_MAX_REG_BUFFERS)
		return -EINVAL;

	ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr));
	if (ret)
		return ret;

	/* Copy original dst nodes from before the cloned range */
	for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
		struct io_rsrc_node *node = ctx->buf_table.nodes[i];

		if (node) {
			data.nodes[i] = node;
			node->refs++;
		}
	}

	off = arg->dst_off;
	i = arg->src_off;
	nr = arg->nr;
	while (nr--) {
		struct io_rsrc_node *dst_node, *src_node;

		src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i);
		if (!src_node) {
			dst_node = NULL;
		} else {
			dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
			if (!dst_node) {
				io_rsrc_data_free(ctx, &data);
				return -ENOMEM;
			}

			refcount_inc(&src_node->buf->refs);
			dst_node->buf = src_node->buf;
		}
		data.nodes[off++] = dst_node;
		i++;
	}

	/* Copy original dst nodes from after the cloned range */
	for (i = nbufs; i < ctx->buf_table.nr; i++) {
		struct io_rsrc_node *node = ctx->buf_table.nodes[i];

		if (node) {
			data.nodes[i] = node;
			node->refs++;
		}
	}

	/*
	 * If asked for replace, put the old table. data->nodes[] holds both
	 * old and new nodes at this point.
	 */
	if (arg->flags & IORING_REGISTER_DST_REPLACE)
		io_rsrc_data_free(ctx, &ctx->buf_table);

	/*
	 * ctx->buf_table must be empty now - either the contents are being
	 * replaced and we just freed the table, or the contents are being
	 * copied to a ring that does not have buffers yet (checked at function
	 * entry).
	 */
	WARN_ON_ONCE(ctx->buf_table.nr);
	ctx->buf_table = data;
	return 0;
}

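/*
 * Rough userspace sketch of the clone operation implemented below
 * (illustrative only; src_ring_fd is the ring whose buffer table is
 * copied, and this register opcode takes nr_args == 1):
 *
 *	struct io_uring_clone_buffers c = { .src_fd = src_ring_fd };
 *
 *	io_uring_register(dst_ring_fd, IORING_REGISTER_CLONE_BUFFERS, &c, 1);
 */
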
/*
 * Copy the registered buffers from the source ring whose file descriptor
 * is given in the src_fd to the current ring. This is identical to registering
 * the buffers with ctx, except faster as mappings already exist.
 *
 * Since the memory is already accounted once, don't account it again.
 */
int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_clone_buffers buf;
	struct io_ring_ctx *src_ctx;
	bool registered_src;
	struct file *file;
	int ret;

	if (copy_from_user(&buf, arg, sizeof(buf)))
		return -EFAULT;
	if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE))
		return -EINVAL;
	if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr)
		return -EBUSY;
	if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
		return -EINVAL;

	registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
	file = io_uring_ctx_get_file(buf.src_fd, registered_src);
	if (IS_ERR(file))
		return PTR_ERR(file);

	src_ctx = file->private_data;
	if (src_ctx != ctx) {
		mutex_unlock(&ctx->uring_lock);
		lock_two_rings(ctx, src_ctx);

		if (src_ctx->submitter_task &&
		    src_ctx->submitter_task != current) {
			ret = -EEXIST;
			goto out;
		}
	}

	ret = io_clone_buffers(ctx, src_ctx, &buf);

out:
	if (src_ctx != ctx)
		mutex_unlock(&src_ctx->uring_lock);

	if (!registered_src)
		fput(file);
	return ret;
}

void io_vec_free(struct iou_vec *iv)
{
	if (!iv->iovec)
		return;
	kfree(iv->iovec);
	iv->iovec = NULL;
	iv->nr = 0;
}

int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries)
{
	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_NOWARN;
	struct iovec *iov;

	iov = kmalloc_objs(iov[0], nr_entries, gfp);
	if (!iov)
		return -ENOMEM;

	io_vec_free(iv);
	iv->iovec = iov;
	iv->nr = nr_entries;
	return 0;
}

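/*
 * Note on the layout used by the import helpers below: struct iou_vec
 * reuses a single allocation for both the caller's iovecs and the bio_vecs
 * built from them. The parsed iovecs are kept at the tail of the array
 * (see io_prep_reg_iovec()), so bio_vec entries written from the front
 * never clobber an iovec before its values have been read.
 */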
static int io_vec_fill_bvec(int ddir, struct iov_iter *iter,
			    struct io_mapped_ubuf *imu,
			    struct iovec *iovec, unsigned nr_iovs,
			    struct iou_vec *vec)
{
	unsigned long folio_size = 1 << imu->folio_shift;
	unsigned long folio_mask = folio_size - 1;
	struct bio_vec *res_bvec = vec->bvec;
	size_t total_len = 0;
	unsigned bvec_idx = 0;
	unsigned iov_idx;

	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
		size_t iov_len = iovec[iov_idx].iov_len;
		u64 buf_addr = (u64)(uintptr_t)iovec[iov_idx].iov_base;
		struct bio_vec *src_bvec;
		size_t offset;
		int ret;

		ret = validate_fixed_range(buf_addr, iov_len, imu);
		if (unlikely(ret))
			return ret;

		if (unlikely(!iov_len))
			return -EFAULT;
		if (unlikely(check_add_overflow(total_len, iov_len, &total_len)))
			return -EOVERFLOW;

		offset = buf_addr - imu->ubuf;
		/*
		 * Only the first bvec can have non zero bv_offset, account it
		 * here and work with full folios below.
		 */
		offset += imu->bvec[0].bv_offset;

		src_bvec = imu->bvec + (offset >> imu->folio_shift);
		offset &= folio_mask;

		for (; iov_len; offset = 0, bvec_idx++, src_bvec++) {
			size_t seg_size = min_t(size_t, iov_len,
						folio_size - offset);

			bvec_set_page(&res_bvec[bvec_idx],
				      src_bvec->bv_page, seg_size, offset);
			iov_len -= seg_size;
		}
	}
	if (total_len > MAX_RW_COUNT)
		return -EINVAL;

	iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len);
	return 0;
}

static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs,
				 struct io_mapped_ubuf *imu)
{
	unsigned shift = imu->folio_shift;
	size_t max_segs = 0;
	unsigned i;

	for (i = 0; i < nr_iovs; i++) {
		max_segs += (iov[i].iov_len >> shift) + 2;
		if (max_segs > INT_MAX)
			return -EOVERFLOW;
	}
	return max_segs;
}

static int io_vec_fill_kern_bvec(int ddir, struct iov_iter *iter,
				 struct io_mapped_ubuf *imu,
				 struct iovec *iovec, unsigned nr_iovs,
				 struct iou_vec *vec)
{
	const struct bio_vec *src_bvec = imu->bvec;
	struct bio_vec *res_bvec = vec->bvec;
	unsigned res_idx = 0;
	size_t total_len = 0;
	unsigned iov_idx;

	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
		size_t offset = (size_t)(uintptr_t)iovec[iov_idx].iov_base;
		size_t iov_len = iovec[iov_idx].iov_len;
		struct bvec_iter bi = {
			.bi_size = offset + iov_len,
		};
		struct bio_vec bv;

		bvec_iter_advance(src_bvec, &bi, offset);
		for_each_mp_bvec(bv, src_bvec, bi, bi)
			res_bvec[res_idx++] = bv;
		total_len += iov_len;
	}
	iov_iter_bvec(iter, ddir, res_bvec, res_idx, total_len);
	return 0;
}

static int iov_kern_bvec_size(const struct iovec *iov,
			      const struct io_mapped_ubuf *imu,
			      unsigned int *nr_seg)
{
	size_t offset = (size_t)(uintptr_t)iov->iov_base;
	const struct bio_vec *bvec = imu->bvec;
	int start = 0, i = 0;
	size_t off = 0;
	int ret;

	ret = validate_fixed_range(offset, iov->iov_len, imu);
	if (unlikely(ret))
		return ret;

	for (i = 0; off < offset + iov->iov_len && i < imu->nr_bvecs;
			off += bvec[i].bv_len, i++) {
		if (offset >= off && offset < off + bvec[i].bv_len)
			start = i;
	}
	*nr_seg = i - start;
	return 0;
}

static int io_kern_bvec_size(struct iovec *iov, unsigned nr_iovs,
			     struct io_mapped_ubuf *imu, unsigned *nr_segs)
{
	unsigned max_segs = 0;
	size_t total_len = 0;
	unsigned i;
	int ret;

	*nr_segs = 0;
	for (i = 0; i < nr_iovs; i++) {
		if (unlikely(!iov[i].iov_len))
			return -EFAULT;
		if (unlikely(check_add_overflow(total_len, iov[i].iov_len,
						&total_len)))
			return -EOVERFLOW;
		ret = iov_kern_bvec_size(&iov[i], imu, &max_segs);
		if (unlikely(ret))
			return ret;
		*nr_segs += max_segs;
	}
	if (total_len > MAX_RW_COUNT)
		return -EINVAL;
	return 0;
}

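/*
 * Sizing note for the import below (illustrative numbers): for user
 * buffers, io_estimate_bvec_size() budgets up to two extra segments per
 * iovec to cover a misaligned head and tail, e.g. a single 10KiB iovec
 * over 4KiB folios is budgeted (10240 >> 12) + 2 = 4 segments, matching
 * the worst-case misalignment. When struct bio_vec is larger than struct
 * iovec (e.g. on 32-bit kernels), the estimate is additionally converted
 * into iovec-sized units so the reallocated array can hold the bio_vecs.
 */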
int io_import_reg_vec(int ddir, struct iov_iter *iter,
		      struct io_kiocb *req, struct iou_vec *vec,
		      unsigned nr_iovs, unsigned issue_flags)
{
	struct io_rsrc_node *node;
	struct io_mapped_ubuf *imu;
	unsigned iovec_off;
	struct iovec *iov;
	unsigned nr_segs;

	node = io_find_buf_node(req, issue_flags);
	if (!node)
		return -EFAULT;
	imu = node->buf;
	if (!(imu->dir & (1 << ddir)))
		return -EFAULT;

	iovec_off = vec->nr - nr_iovs;
	iov = vec->iovec + iovec_off;

	if (imu->flags & IO_REGBUF_F_KBUF) {
		int ret = io_kern_bvec_size(iov, nr_iovs, imu, &nr_segs);

		if (unlikely(ret))
			return ret;
	} else {
		int ret = io_estimate_bvec_size(iov, nr_iovs, imu);

		if (ret < 0)
			return ret;
		nr_segs = ret;
	}

	if (sizeof(struct bio_vec) > sizeof(struct iovec)) {
		size_t bvec_bytes;

		bvec_bytes = nr_segs * sizeof(struct bio_vec);
		nr_segs = (bvec_bytes + sizeof(*iov) - 1) / sizeof(*iov);
		nr_segs += nr_iovs;
	}

	if (nr_segs > vec->nr) {
		struct iou_vec tmp_vec = {};
		int ret;

		ret = io_vec_realloc(&tmp_vec, nr_segs);
		if (ret)
			return ret;

		iovec_off = tmp_vec.nr - nr_iovs;
		memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs);
		io_vec_free(vec);

		*vec = tmp_vec;
		iov = vec->iovec + iovec_off;
		req->flags |= REQ_F_NEED_CLEANUP;
	}

	if (imu->flags & IO_REGBUF_F_KBUF)
		return io_vec_fill_kern_bvec(ddir, iter, imu, iov, nr_iovs, vec);

	return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec);
}

int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv,
		      const struct iovec __user *uvec, size_t uvec_segs)
{
	struct iovec *iov;
	int iovec_off, ret;
	void *res;

	if (uvec_segs > iv->nr) {
		ret = io_vec_realloc(iv, uvec_segs);
		if (ret)
			return ret;
		req->flags |= REQ_F_NEED_CLEANUP;
	}

	/* pad iovec to the right */
	iovec_off = iv->nr - uvec_segs;
	iov = iv->iovec + iovec_off;
	res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov,
			      io_is_compat(req->ctx));
	if (IS_ERR(res))
		return PTR_ERR(res);

	req->flags |= REQ_F_IMPORT_BUFFER;
	return 0;
}