// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring/cmd.h>

#include <uapi/linux/io_uring.h>

#include "filetable.h"
#include "io_uring.h"
#include "openclose.h"
#include "rsrc.h"
#include "memmap.h"
#include "register.h"

struct io_rsrc_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
};

static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
			struct iovec *iov, struct page **last_hpage);

/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)

#define IO_CACHED_BVECS_SEGS	32

int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
	return 0;
}

void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	if (ctx->user)
		__io_unaccount_mem(ctx->user, nr_pages);

	if (ctx->mm_account)
		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}

int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	int ret;

	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
	}

	if (ctx->mm_account)
		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);

	return 0;
}

int io_validate_user_buf_range(u64 uaddr, u64 ulen)
{
	unsigned long tmp, base = (unsigned long)uaddr;
	unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen);

	/* arbitrary limit, but we need something */
	if (ulen > SZ_1G || !ulen)
		return -EFAULT;
	if (check_add_overflow(base, acct_len, &tmp))
		return -EOVERFLOW;
	return 0;
}

static int io_buffer_validate(struct iovec *iov)
{
	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
	if (!iov->iov_base)
		return iov->iov_len ? -EFAULT : 0;

	return io_validate_user_buf_range((unsigned long)iov->iov_base,
					  iov->iov_len);
}

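/*
 * Release callback for buffers registered from user memory: drop the pin
 * held on each folio referenced by the bvec table.
 */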
static void io_release_ubuf(void *priv)
{
	struct io_mapped_ubuf *imu = priv;
	unsigned int i;

	for (i = 0; i < imu->nr_bvecs; i++) {
		struct folio *folio = page_folio(imu->bvec[i].bv_page);

		unpin_user_folio(folio, 1);
	}
}

static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
					   int nr_bvecs)
{
	if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
		return io_cache_alloc(&ctx->imu_cache, GFP_KERNEL);
	return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs),
			GFP_KERNEL);
}

static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
	if (imu->nr_bvecs <= IO_CACHED_BVECS_SEGS)
		io_cache_free(&ctx->imu_cache, imu);
	else
		kvfree(imu);
}

static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
	if (unlikely(refcount_read(&imu->refs) > 1)) {
		if (!refcount_dec_and_test(&imu->refs))
			return;
	}

	if (imu->acct_pages)
		io_unaccount_mem(ctx, imu->acct_pages);
	imu->release(imu->priv);
	io_free_imu(ctx, imu);
}

struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
{
	struct io_rsrc_node *node;

	node = io_cache_alloc(&ctx->node_cache, GFP_KERNEL);
	if (node) {
		node->type = type;
		node->refs = 1;
		node->tag = 0;
		node->file_ptr = 0;
	}
	return node;
}

bool io_rsrc_cache_init(struct io_ring_ctx *ctx)
{
	const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec,
						 IO_CACHED_BVECS_SEGS);
	const int node_size = sizeof(struct io_rsrc_node);
	bool ret;

	ret = io_alloc_cache_init(&ctx->node_cache, IO_ALLOC_CACHE_MAX,
				  node_size, 0);
	ret |= io_alloc_cache_init(&ctx->imu_cache, IO_ALLOC_CACHE_MAX,
				   imu_cache_size, 0);
	return ret;
}

void io_rsrc_cache_free(struct io_ring_ctx *ctx)
{
	io_alloc_cache_free(&ctx->node_cache, kfree);
	io_alloc_cache_free(&ctx->imu_cache, kfree);
}

static void io_clear_table_tags(struct io_rsrc_data *data)
{
	int i;

	for (i = 0; i < data->nr; i++) {
		struct io_rsrc_node *node = data->nodes[i];

		if (node)
			node->tag = 0;
	}
}

__cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
			      struct io_rsrc_data *data)
{
	if (!data->nr)
		return;
	while (data->nr--) {
		if (data->nodes[data->nr])
			io_put_rsrc_node(ctx, data->nodes[data->nr]);
	}
	kvfree(data->nodes);
	data->nodes = NULL;
	data->nr = 0;
}

__cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
{
	data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *),
				     GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (data->nodes) {
		data->nr = nr;
		return 0;
	}
	return -ENOMEM;
}

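/*
 * Update a range of registered file slots from a user array of fds. An fd of
 * -1 clears the slot, IORING_REGISTER_FILES_SKIP leaves it untouched, and a
 * tag may only be attached when a new file is actually installed.
 */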
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	int fd, i, err = 0;
	unsigned int done;

	if (!ctx->file_table.data.nr)
		return -ENXIO;
	if (up->offset + nr_args > ctx->file_table.data.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = up->offset + done;
		if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i))
			io_file_bitmap_clear(&ctx->file_table, i);

		if (fd != -1) {
			struct file *file = fget(fd);
			struct io_rsrc_node *node;

			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
			if (!node) {
				err = -ENOMEM;
				fput(file);
				break;
			}
			ctx->file_table.data.nodes[i] = node;
			if (tag)
				node->tag = tag;
			io_fixed_file_set(node, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}
	return done ? done : err;
}

static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec fast_iov, *iov;
	struct page *last_hpage = NULL;
	struct iovec __user *uvec;
	u64 user_data = up->data;
	__u32 done;
	int i, err;

	if (!ctx->buf_table.nr)
		return -ENXIO;
	if (up->offset + nr_args > ctx->buf_table.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		uvec = u64_to_user_ptr(user_data);
		iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
		if (IS_ERR(iov)) {
			err = PTR_ERR(iov);
			break;
		}
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(iov);
		if (err)
			break;
		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			err = PTR_ERR(node);
			break;
		}
		if (tag) {
			if (!node) {
				err = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
		io_reset_rsrc_node(ctx, &ctx->buf_table, i);
		ctx->buf_table.nodes[i] = node;
		if (ctx->compat)
			user_data += sizeof(struct compat_iovec);
		else
			user_data += sizeof(struct iovec);
	}
	return done ? done : err;
}

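/* Common helper: range-check the update and dispatch on the resource type. */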
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args)
{
	__u32 tmp;

	lockdep_assert_held(&ctx->uring_lock);

	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}

int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	return 0;
}

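/*
 * IORING_OP_FILES_UPDATE with IORING_FILE_INDEX_ALLOC: install each fd into a
 * free fixed file slot and copy the allocated slot index back to userspace.
 * If copying the index back fails, the just-installed slot is closed again.
 */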
static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_table.data.nr)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}

int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;
	int ret;

	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
						&up2, up->nr_args);
		io_ring_submit_unlock(ctx, issue_flags);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_COMPLETE;
}

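/*
 * Final put of a resource node: post the CQE for its tag (if any), drop the
 * file or buffer reference it holds, then recycle the node.
 */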
void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
	if (node->tag)
		io_post_aux_cqe(ctx, node->tag, 0, 0);

	switch (node->type) {
	case IORING_RSRC_FILE:
		fput(io_slot_file(node));
		break;
	case IORING_RSRC_BUFFER:
		io_buffer_unmap(ctx, node->buf);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}

	io_cache_free(&ctx->node_cache, node);
}

int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	if (!ctx->file_table.data.nr)
		return -ENXIO;

	io_free_file_tables(ctx, &ctx->file_table);
	io_file_table_set_alloc_range(ctx, 0, 0);
	return 0;
}

int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_table.data.nr)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args))
		return -ENOMEM;

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		ret = -EFAULT;
		if (tags && copy_from_user(&tag, &tags[i], sizeof(tag)))
			goto fail;
		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd)))
			goto fail;
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (tag)
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		ret = -ENOMEM;
		node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
		if (!node) {
			fput(file);
			goto fail;
		}
		if (tag)
			node->tag = tag;
		ctx->file_table.data.nodes[i] = node;
		io_fixed_file_set(node, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
	return 0;
fail:
	io_clear_table_tags(&ctx->file_table.data);
	io_sqe_files_unregister(ctx);
	return ret;
}

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	if (!ctx->buf_table.nr)
		return -ENXIO;
	io_rsrc_data_free(ctx, &ctx->buf_table);
	return 0;
}

/*
 * Not super efficient, but this is just a registration time. And we do cache
 * the last compound head, so generally we'll only do a full search if we don't
 * match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->buf_table.nr; i++) {
		struct io_rsrc_node *node = ctx->buf_table.nodes[i];
		struct io_mapped_ubuf *imu;

		if (!node)
			continue;
		imu = node->buf;
		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}

static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}

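/*
 * Replace the pinned page array with one entry per folio head page, so that
 * huge page backed buffers need fewer bvec entries. Extra pins on each folio
 * are dropped here; a single pin per folio remains.
 */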
static bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
			       struct io_imu_folio_data *data)
{
	struct page **page_array = *pages, **new_array = NULL;
	unsigned nr_pages_left = *nr_pages;
	unsigned nr_folios = data->nr_folios;
	unsigned i, j;

	/* Store head pages only */
	new_array = kvmalloc_array(nr_folios, sizeof(struct page *), GFP_KERNEL);
	if (!new_array)
		return false;

	for (i = 0, j = 0; i < nr_folios; i++) {
		struct page *p = compound_head(page_array[j]);
		struct folio *folio = page_folio(p);
		unsigned int nr;

		WARN_ON_ONCE(i > 0 && p != page_array[j]);

		nr = i ? data->nr_pages_mid : data->nr_pages_head;
		nr = min(nr, nr_pages_left);
		/* Drop all but one ref, the entire folio will remain pinned. */
		if (nr > 1)
			unpin_user_folio(folio, nr - 1);
		j += nr;
		nr_pages_left -= nr;
		new_array[i] = p;
	}

	WARN_ON_ONCE(j != *nr_pages);

	kvfree(page_array);
	*pages = new_array;
	*nr_pages = nr_folios;
	return true;
}

bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
			      struct io_imu_folio_data *data)
{
	struct folio *folio = page_folio(page_array[0]);
	unsigned int count = 1, nr_folios = 1;
	int i;

	data->nr_pages_mid = folio_nr_pages(folio);
	data->folio_shift = folio_shift(folio);
	data->first_folio_page_idx = folio_page_idx(folio, page_array[0]);

	/*
	 * Check if pages are contiguous inside a folio, and all folios have
	 * the same page count except for the head and tail.
	 */
	for (i = 1; i < nr_pages; i++) {
		if (page_folio(page_array[i]) == folio &&
		    page_array[i] == page_array[i-1] + 1) {
			count++;
			continue;
		}

		if (nr_folios == 1) {
			if (folio_page_idx(folio, page_array[i-1]) !=
			    data->nr_pages_mid - 1)
				return false;

			data->nr_pages_head = count;
		} else if (count != data->nr_pages_mid) {
			return false;
		}

		folio = page_folio(page_array[i]);
		if (folio_size(folio) != (1UL << data->folio_shift) ||
		    folio_page_idx(folio, page_array[i]) != 0)
			return false;

		count = 1;
		nr_folios++;
	}
	if (nr_folios == 1)
		data->nr_pages_head = count;

	data->nr_folios = nr_folios;
	return true;
}

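/*
 * Pin the user memory described by @iov, account it, and build the
 * io_mapped_ubuf bvec table for a registered buffer. Returns NULL for a
 * sparse (NULL base) entry, an ERR_PTR on failure, or the new node on
 * success.
 */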
static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
						   struct iovec *iov,
						   struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	struct io_rsrc_node *node;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;
	struct io_imu_folio_data data;
	bool coalesced = false;

	if (!iov->iov_base)
		return NULL;

	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
	if (!node)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
			     &nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	/* If it's huge page(s), try to coalesce them into fewer bvec entries */
	if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) {
		if (data.nr_pages_mid != 1)
			coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
	}

	imu = io_alloc_imu(ctx, nr_pages);
	if (!imu)
		goto done;

	imu->nr_bvecs = nr_pages;
	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret)
		goto done;

	size = iov->iov_len;
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->len = iov->iov_len;
	imu->folio_shift = PAGE_SHIFT;
	imu->release = io_release_ubuf;
	imu->priv = imu;
	imu->is_kbuf = false;
	imu->dir = IO_IMU_DEST | IO_IMU_SOURCE;
	if (coalesced)
		imu->folio_shift = data.folio_shift;
	refcount_set(&imu->refs, 1);

	off = (unsigned long)iov->iov_base & ~PAGE_MASK;
	if (coalesced)
		off += data.first_folio_page_idx << PAGE_SHIFT;

	node->buf = imu;
	ret = 0;

	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
		off = 0;
		size -= vec_len;
	}
done:
	if (ret) {
		if (imu)
			io_free_imu(ctx, imu);
		if (pages) {
			for (i = 0; i < nr_pages; i++)
				unpin_user_folio(page_folio(pages[i]), 1);
		}
		io_cache_free(&ctx->node_cache, node);
		node = ERR_PTR(ret);
	}
	kvfree(pages);
	return node;
}

int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data data;
	struct iovec fast_iov, *iov = &fast_iov;
	const struct iovec __user *uvec;
	int i, ret;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->buf_table.nr)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_data_alloc(&data, nr_args);
	if (ret)
		return ret;

	if (!arg)
		memset(iov, 0, sizeof(*iov));

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		if (arg) {
			uvec = (struct iovec __user *) arg;
			iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
			if (IS_ERR(iov)) {
				ret = PTR_ERR(iov);
				break;
			}
			ret = io_buffer_validate(iov);
			if (ret)
				break;
			if (ctx->compat)
				arg += sizeof(struct compat_iovec);
			else
				arg += sizeof(struct iovec);
		}

		if (tags) {
			if (copy_from_user(&tag, &tags[i], sizeof(tag))) {
				ret = -EFAULT;
				break;
			}
		}

		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			ret = PTR_ERR(node);
			break;
		}
		if (tag) {
			if (!node) {
				ret = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		data.nodes[i] = node;
	}

	ctx->buf_table = data;
	if (ret) {
		io_clear_table_tags(&ctx->buf_table);
		io_sqe_buffers_unregister(ctx);
	}
	return ret;
}

int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
			    void (*release)(void *), unsigned int index,
			    unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *data = &ctx->buf_table;
	struct req_iterator rq_iter;
	struct io_mapped_ubuf *imu;
	struct io_rsrc_node *node;
	struct bio_vec bv;
	unsigned int nr_bvecs = 0;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	if (index >= data->nr) {
		ret = -EINVAL;
		goto unlock;
	}
	index = array_index_nospec(index, data->nr);

	if (data->nodes[index]) {
		ret = -EBUSY;
		goto unlock;
	}

	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
	if (!node) {
		ret = -ENOMEM;
		goto unlock;
	}

	/*
	 * blk_rq_nr_phys_segments() may overestimate the number of bvecs
	 * but avoids needing to iterate over the bvecs
	 */
	imu = io_alloc_imu(ctx, blk_rq_nr_phys_segments(rq));
	if (!imu) {
		kfree(node);
		ret = -ENOMEM;
		goto unlock;
	}

	imu->ubuf = 0;
	imu->len = blk_rq_bytes(rq);
	imu->acct_pages = 0;
	imu->folio_shift = PAGE_SHIFT;
	refcount_set(&imu->refs, 1);
	imu->release = release;
	imu->priv = rq;
	imu->is_kbuf = true;
	imu->dir = 1 << rq_data_dir(rq);

	rq_for_each_bvec(bv, rq, rq_iter)
		imu->bvec[nr_bvecs++] = bv;
	imu->nr_bvecs = nr_bvecs;

	node->buf = imu;
	data->nodes[index] = node;
unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_register_bvec);

int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
			      unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *data = &ctx->buf_table;
	struct io_rsrc_node *node;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	if (index >= data->nr) {
		ret = -EINVAL;
		goto unlock;
	}
	index = array_index_nospec(index, data->nr);

	node = data->nodes[index];
	if (!node) {
		ret = -EINVAL;
		goto unlock;
	}
	if (!node->buf->is_kbuf) {
		ret = -EBUSY;
		goto unlock;
	}

	io_put_rsrc_node(ctx, node);
	data->nodes[index] = NULL;
unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec);

static int validate_fixed_range(u64 buf_addr, size_t len,
				const struct io_mapped_ubuf *imu)
{
	u64 buf_end;

	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
		return -EFAULT;
	if (unlikely(len > MAX_RW_COUNT))
		return -EFAULT;
	return 0;
}

static int io_import_kbuf(int ddir, struct iov_iter *iter,
			  struct io_mapped_ubuf *imu, size_t len, size_t offset)
{
	size_t count = len + offset;

	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count);
	iov_iter_advance(iter, offset);

	if (count < imu->len) {
		const struct bio_vec *bvec = iter->bvec;

		while (len > bvec->bv_len) {
			len -= bvec->bv_len;
			bvec++;
		}
		iter->nr_segs = 1 + bvec - iter->bvec;
	}
	return 0;
}

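/*
 * Set up @iter for a read/write within a registered user buffer. The bvec
 * table is indexed directly instead of walking it, which relies on all
 * segments except the first and last spanning a full folio.
 */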
static int io_import_fixed(int ddir, struct iov_iter *iter,
			   struct io_mapped_ubuf *imu,
			   u64 buf_addr, size_t len)
{
	const struct bio_vec *bvec;
	size_t folio_mask;
	unsigned nr_segs;
	size_t offset;
	int ret;

	ret = validate_fixed_range(buf_addr, len, imu);
	if (unlikely(ret))
		return ret;
	if (!(imu->dir & (1 << ddir)))
		return -EFAULT;

	offset = buf_addr - imu->ubuf;

	if (imu->is_kbuf)
		return io_import_kbuf(ddir, iter, imu, len, offset);

	/*
	 * Don't use iov_iter_advance() here, as it's really slow for
	 * using the latter parts of a big fixed buffer - it iterates
	 * over each segment manually. We can cheat a bit here for user
	 * registered nodes, because we know that:
	 *
	 * 1) it's a BVEC iter, we set it up
	 * 2) all bvecs are the same in size, except potentially the
	 *    first and last bvec
	 */
	folio_mask = (1UL << imu->folio_shift) - 1;
	bvec = imu->bvec;
	if (offset >= bvec->bv_len) {
		unsigned long seg_skip;

		/* skip first vec */
		offset -= bvec->bv_len;
		seg_skip = 1 + (offset >> imu->folio_shift);
		bvec += seg_skip;
		offset &= folio_mask;
	}
	nr_segs = (offset + len + bvec->bv_offset + folio_mask) >> imu->folio_shift;
	iov_iter_bvec(iter, ddir, bvec, nr_segs, len);
	iter->iov_offset = offset;
	return 0;
}

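/*
 * Look up the registered buffer node for req->buf_index and cache it in the
 * request. A reference is taken and REQ_F_BUF_NODE records that the request
 * now owns it.
 */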
inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req,
					     unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_rsrc_node *node;

	if (req->flags & REQ_F_BUF_NODE)
		return req->buf_node;
	req->flags |= REQ_F_BUF_NODE;

	io_ring_submit_lock(ctx, issue_flags);
	node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
	if (node) {
		node->refs++;
		req->buf_node = node;
		io_ring_submit_unlock(ctx, issue_flags);
		return node;
	}
	req->flags &= ~REQ_F_BUF_NODE;
	io_ring_submit_unlock(ctx, issue_flags);
	return NULL;
}

int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,
		      u64 buf_addr, size_t len, int ddir,
		      unsigned issue_flags)
{
	struct io_rsrc_node *node;

	node = io_find_buf_node(req, issue_flags);
	if (!node)
		return -EFAULT;
	return io_import_fixed(ddir, iter, node->buf, buf_addr, len);
}

/* Lock two rings at once. The rings must be different! */
static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
{
	if (ctx1 > ctx2)
		swap(ctx1, ctx2);
	mutex_lock(&ctx1->uring_lock);
	mutex_lock_nested(&ctx2->uring_lock, SINGLE_DEPTH_NESTING);
}

/* Both rings are locked by the caller. */
static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
			    struct io_uring_clone_buffers *arg)
{
	struct io_rsrc_data data;
	int i, ret, off, nr;
	unsigned int nbufs;

	lockdep_assert_held(&ctx->uring_lock);
	lockdep_assert_held(&src_ctx->uring_lock);

	/*
	 * Accounting state is shared between the two rings; that only works if
	 * both rings are accounted towards the same counters.
	 */
	if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account)
		return -EINVAL;

	/* if offsets are given, must have nr specified too */
	if (!arg->nr && (arg->dst_off || arg->src_off))
		return -EINVAL;
	/* not allowed unless REPLACE is set */
	if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
		return -EBUSY;

	nbufs = src_ctx->buf_table.nr;
	if (!arg->nr)
		arg->nr = nbufs;
	else if (arg->nr > nbufs)
		return -EINVAL;
	else if (arg->nr > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
		return -EOVERFLOW;
	if (nbufs > IORING_MAX_REG_BUFFERS)
		return -EINVAL;

	ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr));
	if (ret)
		return ret;

	/* Fill entries in data from dst that won't overlap with src */
	for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
		struct io_rsrc_node *src_node = ctx->buf_table.nodes[i];

		if (src_node) {
			data.nodes[i] = src_node;
			src_node->refs++;
		}
	}

	ret = -ENXIO;
	nbufs = src_ctx->buf_table.nr;
	if (!nbufs)
		goto out_free;
	ret = -EINVAL;
	if (!arg->nr)
		arg->nr = nbufs;
	else if (arg->nr > nbufs)
		goto out_free;
	ret = -EOVERFLOW;
	if (check_add_overflow(arg->nr, arg->src_off, &off))
		goto out_free;
	if (off > nbufs)
		goto out_free;

	off = arg->dst_off;
	i = arg->src_off;
	nr = arg->nr;
	while (nr--) {
		struct io_rsrc_node *dst_node, *src_node;

		src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i);
		if (!src_node) {
			dst_node = NULL;
		} else {
			dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
			if (!dst_node) {
				ret = -ENOMEM;
				goto out_free;
			}

			refcount_inc(&src_node->buf->refs);
			dst_node->buf = src_node->buf;
		}
		data.nodes[off++] = dst_node;
		i++;
	}

	/*
	 * If asked for replace, put the old table. data->nodes[] holds both
	 * old and new nodes at this point.
	 */
	if (arg->flags & IORING_REGISTER_DST_REPLACE)
		io_rsrc_data_free(ctx, &ctx->buf_table);

	/*
	 * ctx->buf_table must be empty now - either the contents are being
	 * replaced and we just freed the table, or the contents are being
	 * copied to a ring that does not have buffers yet (checked at function
	 * entry).
	 */
	WARN_ON_ONCE(ctx->buf_table.nr);
	ctx->buf_table = data;
	return 0;

out_free:
	io_rsrc_data_free(ctx, &data);
	return ret;
}

/*
 * Copy the registered buffers from the source ring whose file descriptor
 * is given in the src_fd to the current ring. This is identical to registering
 * the buffers with ctx, except faster as mappings already exist.
 *
 * Since the memory is already accounted once, don't account it again.
 */
int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_clone_buffers buf;
	struct io_ring_ctx *src_ctx;
	bool registered_src;
	struct file *file;
	int ret;

	if (copy_from_user(&buf, arg, sizeof(buf)))
		return -EFAULT;
	if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE))
		return -EINVAL;
	if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr)
		return -EBUSY;
	if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
		return -EINVAL;

	registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
	file = io_uring_register_get_file(buf.src_fd, registered_src);
	if (IS_ERR(file))
		return PTR_ERR(file);

	src_ctx = file->private_data;
	if (src_ctx != ctx) {
		mutex_unlock(&ctx->uring_lock);
		lock_two_rings(ctx, src_ctx);

		if (src_ctx->submitter_task &&
		    src_ctx->submitter_task != current) {
			ret = -EEXIST;
			goto out;
		}
	}

	ret = io_clone_buffers(ctx, src_ctx, &buf);

out:
	if (src_ctx != ctx)
		mutex_unlock(&src_ctx->uring_lock);

	fput(file);
	return ret;
}

void io_vec_free(struct iou_vec *iv)
{
	if (!iv->iovec)
		return;
	kfree(iv->iovec);
	iv->iovec = NULL;
	iv->nr = 0;
}

int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries)
{
	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
	struct iovec *iov;

	iov = kmalloc_array(nr_entries, sizeof(iov[0]), gfp);
	if (!iov)
		return -ENOMEM;

	io_vec_free(iv);
	iv->iovec = iov;
	iv->nr = nr_entries;
	return 0;
}

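/*
 * Translate user iovecs that point into a registered user buffer into bvecs,
 * splitting each iovec at folio boundaries of the underlying mapping.
 */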
static int io_vec_fill_bvec(int ddir, struct iov_iter *iter,
			    struct io_mapped_ubuf *imu,
			    struct iovec *iovec, unsigned nr_iovs,
			    struct iou_vec *vec)
{
	unsigned long folio_size = 1 << imu->folio_shift;
	unsigned long folio_mask = folio_size - 1;
	struct bio_vec *res_bvec = vec->bvec;
	size_t total_len = 0;
	unsigned bvec_idx = 0;
	unsigned iov_idx;

	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
		size_t iov_len = iovec[iov_idx].iov_len;
		u64 buf_addr = (u64)(uintptr_t)iovec[iov_idx].iov_base;
		struct bio_vec *src_bvec;
		size_t offset;
		int ret;

		ret = validate_fixed_range(buf_addr, iov_len, imu);
		if (unlikely(ret))
			return ret;

		if (unlikely(!iov_len))
			return -EFAULT;
		if (unlikely(check_add_overflow(total_len, iov_len, &total_len)))
			return -EOVERFLOW;

		offset = buf_addr - imu->ubuf;
		/*
		 * Only the first bvec can have non zero bv_offset, account it
		 * here and work with full folios below.
		 */
		offset += imu->bvec[0].bv_offset;

		src_bvec = imu->bvec + (offset >> imu->folio_shift);
		offset &= folio_mask;

		for (; iov_len; offset = 0, bvec_idx++, src_bvec++) {
			size_t seg_size = min_t(size_t, iov_len,
						folio_size - offset);

			bvec_set_page(&res_bvec[bvec_idx],
				      src_bvec->bv_page, seg_size, offset);
			iov_len -= seg_size;
		}
	}
	if (total_len > MAX_RW_COUNT)
		return -EINVAL;

	iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len);
	return 0;
}

static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs,
				 struct io_mapped_ubuf *imu)
{
	unsigned shift = imu->folio_shift;
	size_t max_segs = 0;
	unsigned i;

	for (i = 0; i < nr_iovs; i++) {
		max_segs += (iov[i].iov_len >> shift) + 2;
		if (max_segs > INT_MAX)
			return -EOVERFLOW;
	}
	return max_segs;
}

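/*
 * Translate iovecs into bvecs for a kernel (bvec-registered) buffer, where
 * iov_base is treated as an offset into the registered request data.
 */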
1379 */ 1380 offset += imu->bvec[0].bv_offset; 1381 1382 src_bvec = imu->bvec + (offset >> imu->folio_shift); 1383 offset &= folio_mask; 1384 1385 for (; iov_len; offset = 0, bvec_idx++, src_bvec++) { 1386 size_t seg_size = min_t(size_t, iov_len, 1387 folio_size - offset); 1388 1389 bvec_set_page(&res_bvec[bvec_idx], 1390 src_bvec->bv_page, seg_size, offset); 1391 iov_len -= seg_size; 1392 } 1393 } 1394 if (total_len > MAX_RW_COUNT) 1395 return -EINVAL; 1396 1397 iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len); 1398 return 0; 1399 } 1400 1401 static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs, 1402 struct io_mapped_ubuf *imu) 1403 { 1404 unsigned shift = imu->folio_shift; 1405 size_t max_segs = 0; 1406 unsigned i; 1407 1408 for (i = 0; i < nr_iovs; i++) { 1409 max_segs += (iov[i].iov_len >> shift) + 2; 1410 if (max_segs > INT_MAX) 1411 return -EOVERFLOW; 1412 } 1413 return max_segs; 1414 } 1415 1416 static int io_vec_fill_kern_bvec(int ddir, struct iov_iter *iter, 1417 struct io_mapped_ubuf *imu, 1418 struct iovec *iovec, unsigned nr_iovs, 1419 struct iou_vec *vec) 1420 { 1421 const struct bio_vec *src_bvec = imu->bvec; 1422 struct bio_vec *res_bvec = vec->bvec; 1423 unsigned res_idx = 0; 1424 size_t total_len = 0; 1425 unsigned iov_idx; 1426 1427 for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) { 1428 size_t offset = (size_t)(uintptr_t)iovec[iov_idx].iov_base; 1429 size_t iov_len = iovec[iov_idx].iov_len; 1430 struct bvec_iter bi = { 1431 .bi_size = offset + iov_len, 1432 }; 1433 struct bio_vec bv; 1434 1435 bvec_iter_advance(src_bvec, &bi, offset); 1436 for_each_mp_bvec(bv, src_bvec, bi, bi) 1437 res_bvec[res_idx++] = bv; 1438 total_len += iov_len; 1439 } 1440 iov_iter_bvec(iter, ddir, res_bvec, res_idx, total_len); 1441 return 0; 1442 } 1443 1444 static int iov_kern_bvec_size(const struct iovec *iov, 1445 const struct io_mapped_ubuf *imu, 1446 unsigned int *nr_seg) 1447 { 1448 size_t offset = (size_t)(uintptr_t)iov->iov_base; 1449 const struct bio_vec *bvec = imu->bvec; 1450 int start = 0, i = 0; 1451 size_t off = 0; 1452 int ret; 1453 1454 ret = validate_fixed_range(offset, iov->iov_len, imu); 1455 if (unlikely(ret)) 1456 return ret; 1457 1458 for (i = 0; off < offset + iov->iov_len && i < imu->nr_bvecs; 1459 off += bvec[i].bv_len, i++) { 1460 if (offset >= off && offset < off + bvec[i].bv_len) 1461 start = i; 1462 } 1463 *nr_seg = i - start; 1464 return 0; 1465 } 1466 1467 static int io_kern_bvec_size(struct iovec *iov, unsigned nr_iovs, 1468 struct io_mapped_ubuf *imu, unsigned *nr_segs) 1469 { 1470 unsigned max_segs = 0; 1471 size_t total_len = 0; 1472 unsigned i; 1473 int ret; 1474 1475 *nr_segs = 0; 1476 for (i = 0; i < nr_iovs; i++) { 1477 if (unlikely(!iov[i].iov_len)) 1478 return -EFAULT; 1479 if (unlikely(check_add_overflow(total_len, iov[i].iov_len, 1480 &total_len))) 1481 return -EOVERFLOW; 1482 ret = iov_kern_bvec_size(&iov[i], imu, &max_segs); 1483 if (unlikely(ret)) 1484 return ret; 1485 *nr_segs += max_segs; 1486 } 1487 if (total_len > MAX_RW_COUNT) 1488 return -EINVAL; 1489 return 0; 1490 } 1491 1492 int io_import_reg_vec(int ddir, struct iov_iter *iter, 1493 struct io_kiocb *req, struct iou_vec *vec, 1494 unsigned nr_iovs, unsigned issue_flags) 1495 { 1496 struct io_rsrc_node *node; 1497 struct io_mapped_ubuf *imu; 1498 unsigned iovec_off; 1499 struct iovec *iov; 1500 unsigned nr_segs; 1501 1502 node = io_find_buf_node(req, issue_flags); 1503 if (!node) 1504 return -EFAULT; 1505 imu = node->buf; 1506 if (!(imu->dir & (1 << ddir))) 1507 return 
int io_import_reg_vec(int ddir, struct iov_iter *iter,
		      struct io_kiocb *req, struct iou_vec *vec,
		      unsigned nr_iovs, unsigned issue_flags)
{
	struct io_rsrc_node *node;
	struct io_mapped_ubuf *imu;
	unsigned iovec_off;
	struct iovec *iov;
	unsigned nr_segs;

	node = io_find_buf_node(req, issue_flags);
	if (!node)
		return -EFAULT;
	imu = node->buf;
	if (!(imu->dir & (1 << ddir)))
		return -EFAULT;

	iovec_off = vec->nr - nr_iovs;
	iov = vec->iovec + iovec_off;

	if (imu->is_kbuf) {
		int ret = io_kern_bvec_size(iov, nr_iovs, imu, &nr_segs);

		if (unlikely(ret))
			return ret;
	} else {
		int ret = io_estimate_bvec_size(iov, nr_iovs, imu);

		if (ret < 0)
			return ret;
		nr_segs = ret;
	}

	if (sizeof(struct bio_vec) > sizeof(struct iovec)) {
		size_t bvec_bytes;

		bvec_bytes = nr_segs * sizeof(struct bio_vec);
		nr_segs = (bvec_bytes + sizeof(*iov) - 1) / sizeof(*iov);
		nr_segs += nr_iovs;
	}

	if (nr_segs > vec->nr) {
		struct iou_vec tmp_vec = {};
		int ret;

		ret = io_vec_realloc(&tmp_vec, nr_segs);
		if (ret)
			return ret;

		iovec_off = tmp_vec.nr - nr_iovs;
		memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs);
		io_vec_free(vec);

		*vec = tmp_vec;
		iov = vec->iovec + iovec_off;
		req->flags |= REQ_F_NEED_CLEANUP;
	}

	if (imu->is_kbuf)
		return io_vec_fill_kern_bvec(ddir, iter, imu, iov, nr_iovs, vec);

	return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec);
}

int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv,
		      const struct iovec __user *uvec, size_t uvec_segs)
{
	struct iovec *iov;
	int iovec_off, ret;
	void *res;

	if (uvec_segs > iv->nr) {
		ret = io_vec_realloc(iv, uvec_segs);
		if (ret)
			return ret;
		req->flags |= REQ_F_NEED_CLEANUP;
	}

	/* pad iovec to the right */
	iovec_off = iv->nr - uvec_segs;
	iov = iv->iovec + iovec_off;
	res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov,
			      io_is_compat(req->ctx));
	if (IS_ERR(res))
		return PTR_ERR(res);

	req->flags |= REQ_F_IMPORT_BUFFER;
	return 0;
}