// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring/cmd.h>

#include <uapi/linux/io_uring.h>

#include "filetable.h"
#include "io_uring.h"
#include "openclose.h"
#include "rsrc.h"
#include "memmap.h"
#include "register.h"

struct io_rsrc_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
};

static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
			struct iovec *iov, struct page **last_hpage);

/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)

#define IO_CACHED_BVECS_SEGS	32

int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
	return 0;
}

void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	if (ctx->user)
		__io_unaccount_mem(ctx->user, nr_pages);

	if (ctx->mm_account)
		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}

int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	int ret;

	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
	}

	if (ctx->mm_account)
		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);

	return 0;
}

int io_validate_user_buf_range(u64 uaddr, u64 ulen)
{
	unsigned long tmp, base = (unsigned long)uaddr;
	unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen);

	/* arbitrary limit, but we need something */
	if (ulen > SZ_1G || !ulen)
		return -EFAULT;
	if (check_add_overflow(base, acct_len, &tmp))
		return -EOVERFLOW;
	return 0;
}

static int io_buffer_validate(struct iovec *iov)
{
	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
	if (!iov->iov_base)
		return iov->iov_len ? -EFAULT : 0;

	return io_validate_user_buf_range((unsigned long)iov->iov_base,
					  iov->iov_len);
}

static void io_release_ubuf(void *priv)
{
	struct io_mapped_ubuf *imu = priv;
	unsigned int i;

	for (i = 0; i < imu->nr_bvecs; i++) {
		struct folio *folio = page_folio(imu->bvec[i].bv_page);

		unpin_user_folio(folio, 1);
	}
}

static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
					   int nr_bvecs)
{
	if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
		return io_cache_alloc(&ctx->imu_cache, GFP_KERNEL);
	return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs),
			GFP_KERNEL);
}

static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
	if (imu->nr_bvecs <= IO_CACHED_BVECS_SEGS)
		io_cache_free(&ctx->imu_cache, imu);
	else
		kvfree(imu);
}

static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
	if (unlikely(refcount_read(&imu->refs) > 1)) {
		if (!refcount_dec_and_test(&imu->refs))
			return;
	}

	if (imu->acct_pages)
		io_unaccount_mem(ctx, imu->acct_pages);
	imu->release(imu->priv);
	io_free_imu(ctx, imu);
}

struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
{
	struct io_rsrc_node *node;

	node = io_cache_alloc(&ctx->node_cache, GFP_KERNEL);
	if (node) {
		node->type = type;
		node->refs = 1;
		node->tag = 0;
		node->file_ptr = 0;
	}
	return node;
}

bool io_rsrc_cache_init(struct io_ring_ctx *ctx)
{
	const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec,
						 IO_CACHED_BVECS_SEGS);
	const int node_size = sizeof(struct io_rsrc_node);
	bool ret;

	ret = io_alloc_cache_init(&ctx->node_cache, IO_ALLOC_CACHE_MAX,
				  node_size, 0);
	ret |= io_alloc_cache_init(&ctx->imu_cache, IO_ALLOC_CACHE_MAX,
				   imu_cache_size, 0);
	return ret;
}

void io_rsrc_cache_free(struct io_ring_ctx *ctx)
{
	io_alloc_cache_free(&ctx->node_cache, kfree);
	io_alloc_cache_free(&ctx->imu_cache, kfree);
}

static void io_clear_table_tags(struct io_rsrc_data *data)
{
	int i;

	for (i = 0; i < data->nr; i++) {
		struct io_rsrc_node *node = data->nodes[i];

		if (node)
			node->tag = 0;
	}
}

__cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
			      struct io_rsrc_data *data)
{
	if (!data->nr)
		return;
	while (data->nr--) {
		if (data->nodes[data->nr])
			io_put_rsrc_node(ctx, data->nodes[data->nr]);
	}
	kvfree(data->nodes);
	data->nodes = NULL;
	data->nr = 0;
}

__cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
{
	data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *),
				     GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (data->nodes) {
		data->nr = nr;
		return 0;
	}
	return -ENOMEM;
}

static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	int fd, i, err = 0;
	unsigned int done;

	if (!ctx->file_table.data.nr)
		return -ENXIO;
	if (up->offset + nr_args > ctx->file_table.data.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = up->offset + done;
		if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i))
			io_file_bitmap_clear(&ctx->file_table, i);

		if (fd != -1) {
			struct file *file = fget(fd);
			struct io_rsrc_node *node;

			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
			if (!node) {
				err = -ENOMEM;
				fput(file);
				break;
			}
			ctx->file_table.data.nodes[i] = node;
			if (tag)
				node->tag = tag;
			io_fixed_file_set(node, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}
	return done ? done : err;
}

static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec fast_iov, *iov;
	struct page *last_hpage = NULL;
	struct iovec __user *uvec;
	u64 user_data = up->data;
	__u32 done;
	int i, err;

	if (!ctx->buf_table.nr)
		return -ENXIO;
	if (up->offset + nr_args > ctx->buf_table.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		uvec = u64_to_user_ptr(user_data);
		iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
		if (IS_ERR(iov)) {
			err = PTR_ERR(iov);
			break;
		}
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(iov);
		if (err)
			break;
		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			err = PTR_ERR(node);
			break;
		}
		if (tag) {
			if (!node) {
				err = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
		io_reset_rsrc_node(ctx, &ctx->buf_table, i);
		ctx->buf_table.nodes[i] = node;
		if (ctx->compat)
			user_data += sizeof(struct compat_iovec);
		else
			user_data += sizeof(struct iovec);
	}
	return done ? done : err;
}

static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args)
{
	__u32 tmp;

	lockdep_assert_held(&ctx->uring_lock);

	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}

int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	return 0;
}

static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_table.data.nr)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}

int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;
	int ret;

	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
						&up2, up->nr_args);
		io_ring_submit_unlock(ctx, issue_flags);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_COMPLETE;
}

void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
	if (node->tag)
		io_post_aux_cqe(ctx, node->tag, 0, 0);

	switch (node->type) {
	case IORING_RSRC_FILE:
		fput(io_slot_file(node));
		break;
	case IORING_RSRC_BUFFER:
		io_buffer_unmap(ctx, node->buf);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}

	io_cache_free(&ctx->node_cache, node);
}

int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	if (!ctx->file_table.data.nr)
		return -ENXIO;

	io_free_file_tables(ctx, &ctx->file_table);
	io_file_table_set_alloc_range(ctx, 0, 0);
	return 0;
}

int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_table.data.nr)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args))
		return -ENOMEM;

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		ret = -EFAULT;
		if (tags && copy_from_user(&tag, &tags[i], sizeof(tag)))
			goto fail;
		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd)))
			goto fail;
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (tag)
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		ret = -ENOMEM;
		node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
		if (!node) {
			fput(file);
			goto fail;
		}
		if (tag)
			node->tag = tag;
		ctx->file_table.data.nodes[i] = node;
		io_fixed_file_set(node, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
	return 0;
fail:
	io_clear_table_tags(&ctx->file_table.data);
	io_sqe_files_unregister(ctx);
	return ret;
}

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	if (!ctx->buf_table.nr)
		return -ENXIO;
	io_rsrc_data_free(ctx, &ctx->buf_table);
	return 0;
}

/*
 * Not super efficient, but this is just a registration time. And we do cache
 * the last compound head, so generally we'll only do a full search if we don't
 * match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->buf_table.nr; i++) {
		struct io_rsrc_node *node = ctx->buf_table.nodes[i];
		struct io_mapped_ubuf *imu;

		if (!node)
			continue;
		imu = node->buf;
		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}

static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}

static bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
			       struct io_imu_folio_data *data)
{
	struct page **page_array = *pages, **new_array = NULL;
	unsigned nr_pages_left = *nr_pages;
	unsigned nr_folios = data->nr_folios;
	unsigned i, j;

	/* Store head pages only */
	new_array = kvmalloc_array(nr_folios, sizeof(struct page *), GFP_KERNEL);
	if (!new_array)
		return false;

	for (i = 0, j = 0; i < nr_folios; i++) {
		struct page *p = compound_head(page_array[j]);
		struct folio *folio = page_folio(p);
		unsigned int nr;

		WARN_ON_ONCE(i > 0 && p != page_array[j]);

		nr = i ? data->nr_pages_mid : data->nr_pages_head;
		nr = min(nr, nr_pages_left);
		/* Drop all but one ref, the entire folio will remain pinned. */
		if (nr > 1)
			unpin_user_folio(folio, nr - 1);
		j += nr;
		nr_pages_left -= nr;
		new_array[i] = p;
	}

	WARN_ON_ONCE(j != *nr_pages);

	kvfree(page_array);
	*pages = new_array;
	*nr_pages = nr_folios;
	return true;
}

bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
			      struct io_imu_folio_data *data)
{
	struct folio *folio = page_folio(page_array[0]);
	unsigned int count = 1, nr_folios = 1;
	int i;

	data->nr_pages_mid = folio_nr_pages(folio);
	data->folio_shift = folio_shift(folio);
	data->first_folio_page_idx = folio_page_idx(folio, page_array[0]);

	/*
	 * Check if pages are contiguous inside a folio, and all folios have
	 * the same page count except for the head and tail.
	 */
	for (i = 1; i < nr_pages; i++) {
		if (page_folio(page_array[i]) == folio &&
		    page_array[i] == page_array[i-1] + 1) {
			count++;
			continue;
		}

		if (nr_folios == 1) {
			if (folio_page_idx(folio, page_array[i-1]) !=
			    data->nr_pages_mid - 1)
				return false;

			data->nr_pages_head = count;
		} else if (count != data->nr_pages_mid) {
			return false;
		}

		folio = page_folio(page_array[i]);
		if (folio_size(folio) != (1UL << data->folio_shift) ||
		    folio_page_idx(folio, page_array[i]) != 0)
			return false;

		count = 1;
		nr_folios++;
	}
	if (nr_folios == 1)
		data->nr_pages_head = count;

	data->nr_folios = nr_folios;
	return true;
}

static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
						   struct iovec *iov,
						   struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	struct io_rsrc_node *node;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;
	struct io_imu_folio_data data;
	bool coalesced = false;

	if (!iov->iov_base)
		return NULL;

	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
	if (!node)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
			     &nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	/* If it's huge page(s), try to coalesce them into fewer bvec entries */
	if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) {
		if (data.nr_pages_mid != 1)
			coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
	}

	imu = io_alloc_imu(ctx, nr_pages);
	if (!imu)
		goto done;

	imu->nr_bvecs = nr_pages;
	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret)
		goto done;

	size = iov->iov_len;
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->len = iov->iov_len;
	imu->folio_shift = PAGE_SHIFT;
	imu->release = io_release_ubuf;
	imu->priv = imu;
	imu->is_kbuf = false;
	imu->dir = IO_IMU_DEST | IO_IMU_SOURCE;
	if (coalesced)
		imu->folio_shift = data.folio_shift;
	refcount_set(&imu->refs, 1);

	off = (unsigned long)iov->iov_base & ~PAGE_MASK;
	if (coalesced)
		off += data.first_folio_page_idx << PAGE_SHIFT;

	node->buf = imu;
	ret = 0;

	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
		off = 0;
		size -= vec_len;
	}
done:
	if (ret) {
		if (imu)
			io_free_imu(ctx, imu);
		if (pages) {
			for (i = 0; i < nr_pages; i++)
				unpin_user_folio(page_folio(pages[i]), 1);
		}
		io_cache_free(&ctx->node_cache, node);
		node = ERR_PTR(ret);
	}
	kvfree(pages);
	return node;
}

int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data data;
	struct iovec fast_iov, *iov = &fast_iov;
	const struct iovec __user *uvec;
	int i, ret;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->buf_table.nr)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_data_alloc(&data, nr_args);
	if (ret)
		return ret;

	if (!arg)
		memset(iov, 0, sizeof(*iov));

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		if (arg) {
			uvec = (struct iovec __user *) arg;
			iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
			if (IS_ERR(iov)) {
				ret = PTR_ERR(iov);
				break;
			}
			ret = io_buffer_validate(iov);
			if (ret)
				break;
			if (ctx->compat)
				arg += sizeof(struct compat_iovec);
			else
				arg += sizeof(struct iovec);
		}

		if (tags) {
			if (copy_from_user(&tag, &tags[i], sizeof(tag))) {
				ret = -EFAULT;
				break;
			}
		}

		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			ret = PTR_ERR(node);
			break;
		}
		if (tag) {
			if (!node) {
				ret = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		data.nodes[i] = node;
	}

	ctx->buf_table = data;
	if (ret) {
		io_clear_table_tags(&ctx->buf_table);
		io_sqe_buffers_unregister(ctx);
	}
	return ret;
}

int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
			    void (*release)(void *), unsigned int index,
			    unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *data = &ctx->buf_table;
	struct req_iterator rq_iter;
	struct io_mapped_ubuf *imu;
	struct io_rsrc_node *node;
	struct bio_vec bv, *bvec;
	u16 nr_bvecs;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	if (index >= data->nr) {
		ret = -EINVAL;
		goto unlock;
	}
	index = array_index_nospec(index, data->nr);

	if (data->nodes[index]) {
		ret = -EBUSY;
		goto unlock;
	}

	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
	if (!node) {
		ret = -ENOMEM;
		goto unlock;
	}

	nr_bvecs = blk_rq_nr_phys_segments(rq);
	imu = io_alloc_imu(ctx, nr_bvecs);
	if (!imu) {
		kfree(node);
		ret = -ENOMEM;
		goto unlock;
	}

	imu->ubuf = 0;
	imu->len = blk_rq_bytes(rq);
	imu->acct_pages = 0;
	imu->folio_shift = PAGE_SHIFT;
	imu->nr_bvecs = nr_bvecs;
	refcount_set(&imu->refs, 1);
	imu->release = release;
	imu->priv = rq;
	imu->is_kbuf = true;
	imu->dir = 1 << rq_data_dir(rq);

	bvec = imu->bvec;
	rq_for_each_bvec(bv, rq, rq_iter)
		*bvec++ = bv;

	node->buf = imu;
	data->nodes[index] = node;
unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_register_bvec);

int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
			      unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *data = &ctx->buf_table;
	struct io_rsrc_node *node;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	if (index >= data->nr) {
		ret = -EINVAL;
		goto unlock;
	}
	index = array_index_nospec(index, data->nr);

	node = data->nodes[index];
	if (!node) {
		ret = -EINVAL;
		goto unlock;
	}
	if (!node->buf->is_kbuf) {
		ret = -EBUSY;
		goto unlock;
	}

	io_put_rsrc_node(ctx, node);
	data->nodes[index] = NULL;
unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec);

static int validate_fixed_range(u64 buf_addr, size_t len,
				const struct io_mapped_ubuf *imu)
{
	u64 buf_end;

	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
		return -EFAULT;
	if (unlikely(len > MAX_RW_COUNT))
		return -EFAULT;
	return 0;
}

static int io_import_kbuf(int ddir, struct iov_iter *iter,
			  struct io_mapped_ubuf *imu, size_t len, size_t offset)
{
	size_t count = len + offset;

	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count);
	iov_iter_advance(iter, offset);

	if (count < imu->len) {
		const struct bio_vec *bvec = iter->bvec;

		while (len > bvec->bv_len) {
			len -= bvec->bv_len;
			bvec++;
		}
		iter->nr_segs = 1 + bvec - iter->bvec;
	}
	return 0;
}

static int io_import_fixed(int ddir, struct iov_iter *iter,
			   struct io_mapped_ubuf *imu,
			   u64 buf_addr, size_t len)
{
	const struct bio_vec *bvec;
	size_t folio_mask;
	unsigned nr_segs;
	size_t offset;
	int ret;

	ret = validate_fixed_range(buf_addr, len, imu);
	if (unlikely(ret))
		return ret;
	if (!(imu->dir & (1 << ddir)))
		return -EFAULT;

	offset = buf_addr - imu->ubuf;

	if (imu->is_kbuf)
		return io_import_kbuf(ddir, iter, imu, len, offset);

	/*
	 * Don't use iov_iter_advance() here, as it's really slow for
	 * using the latter parts of a big fixed buffer - it iterates
	 * over each segment manually. We can cheat a bit here for user
	 * registered nodes, because we know that:
	 *
	 * 1) it's a BVEC iter, we set it up
	 * 2) all bvecs are the same in size, except potentially the
	 *    first and last bvec
	 */
	folio_mask = (1UL << imu->folio_shift) - 1;
	bvec = imu->bvec;
	if (offset >= bvec->bv_len) {
		unsigned long seg_skip;

		/* skip first vec */
		offset -= bvec->bv_len;
		seg_skip = 1 + (offset >> imu->folio_shift);
		bvec += seg_skip;
		offset &= folio_mask;
	}
	nr_segs = (offset + len + bvec->bv_offset + folio_mask) >> imu->folio_shift;
	iov_iter_bvec(iter, ddir, bvec, nr_segs, len);
	iter->iov_offset = offset;
	return 0;
}

inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req,
					     unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_rsrc_node *node;

	if (req->flags & REQ_F_BUF_NODE)
		return req->buf_node;
	req->flags |= REQ_F_BUF_NODE;

	io_ring_submit_lock(ctx, issue_flags);
	node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
	if (node) {
		node->refs++;
		req->buf_node = node;
		io_ring_submit_unlock(ctx, issue_flags);
		return node;
	}
	req->flags &= ~REQ_F_BUF_NODE;
	io_ring_submit_unlock(ctx, issue_flags);
	return NULL;
}

int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,
		      u64 buf_addr, size_t len, int ddir,
		      unsigned issue_flags)
{
	struct io_rsrc_node *node;

	node = io_find_buf_node(req, issue_flags);
	if (!node)
		return -EFAULT;
	return io_import_fixed(ddir, iter, node->buf, buf_addr, len);
}

/* Lock two rings at once. The rings must be different! */
static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
{
	if (ctx1 > ctx2)
		swap(ctx1, ctx2);
	mutex_lock(&ctx1->uring_lock);
	mutex_lock_nested(&ctx2->uring_lock, SINGLE_DEPTH_NESTING);
}

/* Both rings are locked by the caller. */
static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
			    struct io_uring_clone_buffers *arg)
{
	struct io_rsrc_data data;
	int i, ret, off, nr;
	unsigned int nbufs;

	lockdep_assert_held(&ctx->uring_lock);
	lockdep_assert_held(&src_ctx->uring_lock);

	/*
	 * Accounting state is shared between the two rings; that only works if
	 * both rings are accounted towards the same counters.
	 */
	if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account)
		return -EINVAL;

	/* if offsets are given, must have nr specified too */
	if (!arg->nr && (arg->dst_off || arg->src_off))
		return -EINVAL;
	/* not allowed unless REPLACE is set */
	if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
		return -EBUSY;

	nbufs = src_ctx->buf_table.nr;
	if (!arg->nr)
		arg->nr = nbufs;
	else if (arg->nr > nbufs)
		return -EINVAL;
	else if (arg->nr > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
		return -EOVERFLOW;
	if (nbufs > IORING_MAX_REG_BUFFERS)
		return -EINVAL;

	ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr));
	if (ret)
		return ret;

	/* Fill entries in data from dst that won't overlap with src */
	for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
		struct io_rsrc_node *src_node = ctx->buf_table.nodes[i];

		if (src_node) {
			data.nodes[i] = src_node;
			src_node->refs++;
		}
	}

	ret = -ENXIO;
	nbufs = src_ctx->buf_table.nr;
	if (!nbufs)
		goto out_free;
	ret = -EINVAL;
	if (!arg->nr)
		arg->nr = nbufs;
	else if (arg->nr > nbufs)
		goto out_free;
	ret = -EOVERFLOW;
	if (check_add_overflow(arg->nr, arg->src_off, &off))
		goto out_free;
	if (off > nbufs)
		goto out_free;

	off = arg->dst_off;
	i = arg->src_off;
	nr = arg->nr;
	while (nr--) {
		struct io_rsrc_node *dst_node, *src_node;

		src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i);
		if (!src_node) {
			dst_node = NULL;
		} else {
			dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
			if (!dst_node) {
				ret = -ENOMEM;
				goto out_free;
			}

			refcount_inc(&src_node->buf->refs);
			dst_node->buf = src_node->buf;
		}
		data.nodes[off++] = dst_node;
		i++;
	}

	/*
	 * If asked for replace, put the old table. data->nodes[] holds both
	 * old and new nodes at this point.
	 */
	if (arg->flags & IORING_REGISTER_DST_REPLACE)
		io_rsrc_data_free(ctx, &ctx->buf_table);

	/*
	 * ctx->buf_table must be empty now - either the contents are being
	 * replaced and we just freed the table, or the contents are being
	 * copied to a ring that does not have buffers yet (checked at function
	 * entry).
	 */
	WARN_ON_ONCE(ctx->buf_table.nr);
	ctx->buf_table = data;
	return 0;

out_free:
	io_rsrc_data_free(ctx, &data);
	return ret;
}

/*
 * Copy the registered buffers from the source ring whose file descriptor
 * is given in the src_fd to the current ring. This is identical to registering
 * the buffers with ctx, except faster as mappings already exist.
 *
 * Since the memory is already accounted once, don't account it again.
 */
int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_clone_buffers buf;
	struct io_ring_ctx *src_ctx;
	bool registered_src;
	struct file *file;
	int ret;

	if (copy_from_user(&buf, arg, sizeof(buf)))
		return -EFAULT;
	if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE))
		return -EINVAL;
	if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr)
		return -EBUSY;
	if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
		return -EINVAL;

	registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
	file = io_uring_register_get_file(buf.src_fd, registered_src);
	if (IS_ERR(file))
		return PTR_ERR(file);

	src_ctx = file->private_data;
	if (src_ctx != ctx) {
		mutex_unlock(&ctx->uring_lock);
		lock_two_rings(ctx, src_ctx);

		if (src_ctx->submitter_task &&
		    src_ctx->submitter_task != current) {
			ret = -EEXIST;
			goto out;
		}
	}

	ret = io_clone_buffers(ctx, src_ctx, &buf);

out:
	if (src_ctx != ctx)
		mutex_unlock(&src_ctx->uring_lock);

	fput(file);
	return ret;
}

void io_vec_free(struct iou_vec *iv)
{
	if (!iv->iovec)
		return;
	kfree(iv->iovec);
	iv->iovec = NULL;
	iv->nr = 0;
}

int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries)
{
	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
	struct iovec *iov;

	iov = kmalloc_array(nr_entries, sizeof(iov[0]), gfp);
	if (!iov)
		return -ENOMEM;

	io_vec_free(iv);
	iv->iovec = iov;
	iv->nr = nr_entries;
	return 0;
}

static int io_vec_fill_bvec(int ddir, struct iov_iter *iter,
			    struct io_mapped_ubuf *imu,
			    struct iovec *iovec, unsigned nr_iovs,
			    struct iou_vec *vec)
{
	unsigned long folio_size = 1 << imu->folio_shift;
	unsigned long folio_mask = folio_size - 1;
	struct bio_vec *res_bvec = vec->bvec;
	size_t total_len = 0;
	unsigned bvec_idx = 0;
	unsigned iov_idx;

	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
		size_t iov_len = iovec[iov_idx].iov_len;
		u64 buf_addr = (u64)(uintptr_t)iovec[iov_idx].iov_base;
		struct bio_vec *src_bvec;
		size_t offset;
		int ret;

		ret = validate_fixed_range(buf_addr, iov_len, imu);
		if (unlikely(ret))
			return ret;

		if (unlikely(!iov_len))
			return -EFAULT;
		if (unlikely(check_add_overflow(total_len, iov_len, &total_len)))
			return -EOVERFLOW;

		offset = buf_addr - imu->ubuf;
		/*
		 * Only the first bvec can have non zero bv_offset, account it
		 * here and work with full folios below.
		 */
		offset += imu->bvec[0].bv_offset;

		src_bvec = imu->bvec + (offset >> imu->folio_shift);
		offset &= folio_mask;

		for (; iov_len; offset = 0, bvec_idx++, src_bvec++) {
			size_t seg_size = min_t(size_t, iov_len,
						folio_size - offset);

			bvec_set_page(&res_bvec[bvec_idx],
				      src_bvec->bv_page, seg_size, offset);
			iov_len -= seg_size;
		}
	}
	if (total_len > MAX_RW_COUNT)
		return -EINVAL;

	iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len);
	return 0;
}

static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs,
				 struct io_mapped_ubuf *imu)
{
	unsigned shift = imu->folio_shift;
	size_t max_segs = 0;
	unsigned i;

	for (i = 0; i < nr_iovs; i++)
		max_segs += (iov[i].iov_len >> shift) + 2;
	return max_segs;
}

static int io_vec_fill_kern_bvec(int ddir, struct iov_iter *iter,
				 struct io_mapped_ubuf *imu,
				 struct iovec *iovec, unsigned nr_iovs,
				 struct iou_vec *vec)
{
	const struct bio_vec *src_bvec = imu->bvec;
	struct bio_vec *res_bvec = vec->bvec;
	unsigned res_idx = 0;
	size_t total_len = 0;
	unsigned iov_idx;

	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
		size_t offset = (size_t)(uintptr_t)iovec[iov_idx].iov_base;
		size_t iov_len = iovec[iov_idx].iov_len;
		struct bvec_iter bi = {
			.bi_size	= offset + iov_len,
		};
		struct bio_vec bv;

		bvec_iter_advance(src_bvec, &bi, offset);
		for_each_mp_bvec(bv, src_bvec, bi, bi)
			res_bvec[res_idx++] = bv;
		total_len += iov_len;
	}
	iov_iter_bvec(iter, ddir, res_bvec, res_idx, total_len);
	return 0;
}

static int iov_kern_bvec_size(const struct iovec *iov,
			      const struct io_mapped_ubuf *imu,
			      unsigned int *nr_seg)
{
	size_t offset = (size_t)(uintptr_t)iov->iov_base;
	const struct bio_vec *bvec = imu->bvec;
	int start = 0, i = 0;
	size_t off = 0;
	int ret;

	ret = validate_fixed_range(offset, iov->iov_len, imu);
	if (unlikely(ret))
		return ret;

	for (i = 0; off < offset + iov->iov_len && i < imu->nr_bvecs;
			off += bvec[i].bv_len, i++) {
		if (offset >= off && offset < off + bvec[i].bv_len)
			start = i;
	}
	*nr_seg = i - start;
	return 0;
}

static int io_kern_bvec_size(struct iovec *iov, unsigned nr_iovs,
			     struct io_mapped_ubuf *imu, unsigned *nr_segs)
{
	unsigned max_segs = 0;
	size_t total_len = 0;
	unsigned i;
	int ret;

	*nr_segs = 0;
	for (i = 0; i < nr_iovs; i++) {
		if (unlikely(!iov[i].iov_len))
			return -EFAULT;
		if (unlikely(check_add_overflow(total_len, iov[i].iov_len,
						&total_len)))
			return -EOVERFLOW;
		ret = iov_kern_bvec_size(&iov[i], imu, &max_segs);
		if (unlikely(ret))
			return ret;
		*nr_segs += max_segs;
	}
	if (total_len > MAX_RW_COUNT)
		return -EINVAL;
	return 0;
}

int io_import_reg_vec(int ddir, struct iov_iter *iter,
		      struct io_kiocb *req, struct iou_vec *vec,
		      unsigned nr_iovs, unsigned issue_flags)
{
	struct io_rsrc_node *node;
	struct io_mapped_ubuf *imu;
	unsigned iovec_off;
	struct iovec *iov;
	unsigned nr_segs;

	node = io_find_buf_node(req, issue_flags);
	if (!node)
		return -EFAULT;
	imu = node->buf;
	if (!(imu->dir & (1 << ddir)))
		return -EFAULT;

	iovec_off = vec->nr - nr_iovs;
	iov = vec->iovec + iovec_off;

	if (imu->is_kbuf) {
		int ret = io_kern_bvec_size(iov, nr_iovs, imu, &nr_segs);

		if (unlikely(ret))
			return ret;
	} else {
		nr_segs = io_estimate_bvec_size(iov, nr_iovs, imu);
	}

	if (sizeof(struct bio_vec) > sizeof(struct iovec)) {
		size_t bvec_bytes;

		bvec_bytes = nr_segs * sizeof(struct bio_vec);
		nr_segs = (bvec_bytes + sizeof(*iov) - 1) / sizeof(*iov);
		nr_segs += nr_iovs;
	}

	if (nr_segs > vec->nr) {
		struct iou_vec tmp_vec = {};
		int ret;

		ret = io_vec_realloc(&tmp_vec, nr_segs);
		if (ret)
			return ret;

		iovec_off = tmp_vec.nr - nr_iovs;
		memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs);
		io_vec_free(vec);

		*vec = tmp_vec;
		iov = vec->iovec + iovec_off;
		req->flags |= REQ_F_NEED_CLEANUP;
	}

	if (imu->is_kbuf)
		return io_vec_fill_kern_bvec(ddir, iter, imu, iov, nr_iovs, vec);

	return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec);
}

int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv,
		      const struct iovec __user *uvec, size_t uvec_segs)
{
	struct iovec *iov;
	int iovec_off, ret;
	void *res;

	if (uvec_segs > iv->nr) {
		ret = io_vec_realloc(iv, uvec_segs);
		if (ret)
			return ret;
		req->flags |= REQ_F_NEED_CLEANUP;
	}

	/* pad iovec to the right */
	iovec_off = iv->nr - uvec_segs;
	iov = iv->iovec + iovec_off;
	res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov,
			      io_is_compat(req->ctx));
	if (IS_ERR(res))
		return PTR_ERR(res);

	req->flags |= REQ_F_IMPORT_BUFFER;
	return 0;
}
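
/*
 * Illustrative sketch (comment only, not compiled as part of this file) of how
 * a ->uring_cmd() driver might use the exported io_buffer_register_bvec() /
 * io_buffer_unregister_bvec() helpers defined above. The driver-side names
 * below (my_release(), my_handle_cmd(), my_cmd_to_rq()) are hypothetical; only
 * the two exported helpers and their behaviour come from this file.
 *
 *	static void my_release(void *priv)
 *	{
 *		struct request *rq = priv;
 *
 *		// Invoked via io_buffer_unmap() once the last reference to the
 *		// registered buffer node is dropped; safe to complete @rq here.
 *	}
 *
 *	static int my_handle_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
 *	{
 *		struct request *rq = my_cmd_to_rq(cmd);	// driver specific
 *		int ret;
 *
 *		// Publish rq's data segments at fixed buffer index 0. Fails
 *		// with -EBUSY if the slot is occupied, -EINVAL if the index is
 *		// outside the registered buffer table.
 *		ret = io_buffer_register_bvec(cmd, rq, my_release, 0, issue_flags);
 *		if (ret)
 *			return ret;
 *
 *		// ... submit fixed-buffer I/O against index 0 ...
 *
 *		// Drop the table reference; my_release() runs when the final
 *		// in-flight user of the buffer is done.
 *		return io_buffer_unregister_bvec(cmd, 0, issue_flags);
 *	}
 */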