1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/kernel.h> 3 #include <linux/errno.h> 4 #include <linux/fs.h> 5 #include <linux/file.h> 6 #include <linux/mm.h> 7 #include <linux/slab.h> 8 #include <linux/nospec.h> 9 #include <linux/hugetlb.h> 10 #include <linux/compat.h> 11 #include <linux/io_uring.h> 12 #include <linux/io_uring/cmd.h> 13 14 #include <uapi/linux/io_uring.h> 15 16 #include "filetable.h" 17 #include "io_uring.h" 18 #include "openclose.h" 19 #include "rsrc.h" 20 #include "memmap.h" 21 #include "register.h" 22 23 struct io_rsrc_update { 24 struct file *file; 25 u64 arg; 26 u32 nr_args; 27 u32 offset; 28 }; 29 30 static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, 31 struct iovec *iov, struct page **last_hpage); 32 33 /* only define max */ 34 #define IORING_MAX_FIXED_FILES (1U << 20) 35 #define IORING_MAX_REG_BUFFERS (1U << 14) 36 37 #define IO_CACHED_BVECS_SEGS 32 38 39 int __io_account_mem(struct user_struct *user, unsigned long nr_pages) 40 { 41 unsigned long page_limit, cur_pages, new_pages; 42 43 if (!nr_pages) 44 return 0; 45 46 /* Don't allow more pages than we can safely lock */ 47 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; 48 49 cur_pages = atomic_long_read(&user->locked_vm); 50 do { 51 new_pages = cur_pages + nr_pages; 52 if (new_pages > page_limit) 53 return -ENOMEM; 54 } while (!atomic_long_try_cmpxchg(&user->locked_vm, 55 &cur_pages, new_pages)); 56 return 0; 57 } 58 59 void io_unaccount_mem(struct user_struct *user, struct mm_struct *mm_account, 60 unsigned long nr_pages) 61 { 62 if (user) 63 __io_unaccount_mem(user, nr_pages); 64 65 if (mm_account) 66 atomic64_sub(nr_pages, &mm_account->pinned_vm); 67 } 68 69 int io_account_mem(struct user_struct *user, struct mm_struct *mm_account, 70 unsigned long nr_pages) 71 { 72 int ret; 73 74 if (user) { 75 ret = __io_account_mem(user, nr_pages); 76 if (ret) 77 return ret; 78 } 79 80 if (mm_account) 81 atomic64_add(nr_pages, &mm_account->pinned_vm); 82 83 return 0; 84 } 85 86 int io_validate_user_buf_range(u64 uaddr, u64 ulen) 87 { 88 unsigned long tmp, base = (unsigned long)uaddr; 89 unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen); 90 91 /* arbitrary limit, but we need something */ 92 if (ulen > SZ_1G || !ulen) 93 return -EFAULT; 94 if (check_add_overflow(base, acct_len, &tmp)) 95 return -EOVERFLOW; 96 return 0; 97 } 98 99 static int io_buffer_validate(struct iovec *iov) 100 { 101 /* 102 * Don't impose further limits on the size and buffer 103 * constraints here, we'll -EINVAL later when IO is 104 * submitted if they are wrong. 105 */ 106 if (!iov->iov_base) 107 return iov->iov_len ? -EFAULT : 0; 108 109 return io_validate_user_buf_range((unsigned long)iov->iov_base, 110 iov->iov_len); 111 } 112 113 static void io_release_ubuf(void *priv) 114 { 115 struct io_mapped_ubuf *imu = priv; 116 unsigned int i; 117 118 for (i = 0; i < imu->nr_bvecs; i++) { 119 struct folio *folio = page_folio(imu->bvec[i].bv_page); 120 121 unpin_user_folio(folio, 1); 122 } 123 } 124 125 static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx, 126 int nr_bvecs) 127 { 128 if (nr_bvecs <= IO_CACHED_BVECS_SEGS) 129 return io_cache_alloc(&ctx->imu_cache, GFP_KERNEL); 130 return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs), 131 GFP_KERNEL); 132 } 133 134 static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) 135 { 136 if (imu->nr_bvecs <= IO_CACHED_BVECS_SEGS) 137 io_cache_free(&ctx->imu_cache, imu); 138 else 139 kvfree(imu); 140 } 141 142 static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) 143 { 144 if (unlikely(refcount_read(&imu->refs) > 1)) { 145 if (!refcount_dec_and_test(&imu->refs)) 146 return; 147 } 148 149 if (imu->acct_pages) 150 io_unaccount_mem(ctx->user, ctx->mm_account, imu->acct_pages); 151 imu->release(imu->priv); 152 io_free_imu(ctx, imu); 153 } 154 155 struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type) 156 { 157 struct io_rsrc_node *node; 158 159 node = io_cache_alloc(&ctx->node_cache, GFP_KERNEL); 160 if (node) { 161 node->type = type; 162 node->refs = 1; 163 node->tag = 0; 164 node->file_ptr = 0; 165 } 166 return node; 167 } 168 169 bool io_rsrc_cache_init(struct io_ring_ctx *ctx) 170 { 171 const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec, 172 IO_CACHED_BVECS_SEGS); 173 const int node_size = sizeof(struct io_rsrc_node); 174 bool ret; 175 176 ret = io_alloc_cache_init(&ctx->node_cache, IO_ALLOC_CACHE_MAX, 177 node_size, 0); 178 ret |= io_alloc_cache_init(&ctx->imu_cache, IO_ALLOC_CACHE_MAX, 179 imu_cache_size, 0); 180 return ret; 181 } 182 183 void io_rsrc_cache_free(struct io_ring_ctx *ctx) 184 { 185 io_alloc_cache_free(&ctx->node_cache, kfree); 186 io_alloc_cache_free(&ctx->imu_cache, kfree); 187 } 188 189 static void io_clear_table_tags(struct io_rsrc_data *data) 190 { 191 int i; 192 193 for (i = 0; i < data->nr; i++) { 194 struct io_rsrc_node *node = data->nodes[i]; 195 196 if (node) 197 node->tag = 0; 198 } 199 } 200 201 __cold void io_rsrc_data_free(struct io_ring_ctx *ctx, 202 struct io_rsrc_data *data) 203 { 204 if (!data->nr) 205 return; 206 while (data->nr--) { 207 if (data->nodes[data->nr]) 208 io_put_rsrc_node(ctx, data->nodes[data->nr]); 209 } 210 kvfree(data->nodes); 211 data->nodes = NULL; 212 data->nr = 0; 213 } 214 215 __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr) 216 { 217 data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *), 218 GFP_KERNEL_ACCOUNT | __GFP_ZERO); 219 if (data->nodes) { 220 data->nr = nr; 221 return 0; 222 } 223 return -ENOMEM; 224 } 225 226 static int __io_sqe_files_update(struct io_ring_ctx *ctx, 227 struct io_uring_rsrc_update2 *up, 228 unsigned nr_args) 229 { 230 u64 __user *tags = u64_to_user_ptr(up->tags); 231 __s32 __user *fds = u64_to_user_ptr(up->data); 232 int fd, i, err = 0; 233 unsigned int done; 234 235 if (!ctx->file_table.data.nr) 236 return -ENXIO; 237 if (up->offset + nr_args > ctx->file_table.data.nr) 238 return -EINVAL; 239 240 for (done = 0; done < nr_args; done++) { 241 u64 tag = 0; 242 243 if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) || 244 copy_from_user(&fd, &fds[done], sizeof(fd))) { 245 err = -EFAULT; 246 break; 247 } 248 if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) { 249 err = -EINVAL; 250 break; 251 } 252 if (fd == IORING_REGISTER_FILES_SKIP) 253 continue; 254 255 i = up->offset + done; 256 if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i)) 257 io_file_bitmap_clear(&ctx->file_table, i); 258 259 if (fd != -1) { 260 struct file *file = fget(fd); 261 struct io_rsrc_node *node; 262 263 if (!file) { 264 err = -EBADF; 265 break; 266 } 267 /* 268 * Don't allow io_uring instances to be registered. 269 */ 270 if (io_is_uring_fops(file)) { 271 fput(file); 272 err = -EBADF; 273 break; 274 } 275 node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); 276 if (!node) { 277 err = -ENOMEM; 278 fput(file); 279 break; 280 } 281 ctx->file_table.data.nodes[i] = node; 282 if (tag) 283 node->tag = tag; 284 io_fixed_file_set(node, file); 285 io_file_bitmap_set(&ctx->file_table, i); 286 } 287 } 288 return done ? done : err; 289 } 290 291 static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, 292 struct io_uring_rsrc_update2 *up, 293 unsigned int nr_args) 294 { 295 u64 __user *tags = u64_to_user_ptr(up->tags); 296 struct iovec fast_iov, *iov; 297 struct page *last_hpage = NULL; 298 struct iovec __user *uvec; 299 u64 user_data = up->data; 300 __u32 done; 301 int i, err; 302 303 if (!ctx->buf_table.nr) 304 return -ENXIO; 305 if (up->offset + nr_args > ctx->buf_table.nr) 306 return -EINVAL; 307 308 for (done = 0; done < nr_args; done++) { 309 struct io_rsrc_node *node; 310 u64 tag = 0; 311 312 uvec = u64_to_user_ptr(user_data); 313 iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); 314 if (IS_ERR(iov)) { 315 err = PTR_ERR(iov); 316 break; 317 } 318 if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) { 319 err = -EFAULT; 320 break; 321 } 322 err = io_buffer_validate(iov); 323 if (err) 324 break; 325 node = io_sqe_buffer_register(ctx, iov, &last_hpage); 326 if (IS_ERR(node)) { 327 err = PTR_ERR(node); 328 break; 329 } 330 if (tag) { 331 if (!node) { 332 err = -EINVAL; 333 break; 334 } 335 node->tag = tag; 336 } 337 i = array_index_nospec(up->offset + done, ctx->buf_table.nr); 338 io_reset_rsrc_node(ctx, &ctx->buf_table, i); 339 ctx->buf_table.nodes[i] = node; 340 if (ctx->compat) 341 user_data += sizeof(struct compat_iovec); 342 else 343 user_data += sizeof(struct iovec); 344 } 345 return done ? done : err; 346 } 347 348 static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, 349 struct io_uring_rsrc_update2 *up, 350 unsigned nr_args) 351 { 352 __u32 tmp; 353 354 lockdep_assert_held(&ctx->uring_lock); 355 356 if (check_add_overflow(up->offset, nr_args, &tmp)) 357 return -EOVERFLOW; 358 359 switch (type) { 360 case IORING_RSRC_FILE: 361 return __io_sqe_files_update(ctx, up, nr_args); 362 case IORING_RSRC_BUFFER: 363 return __io_sqe_buffers_update(ctx, up, nr_args); 364 } 365 return -EINVAL; 366 } 367 368 int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, 369 unsigned nr_args) 370 { 371 struct io_uring_rsrc_update2 up; 372 373 if (!nr_args) 374 return -EINVAL; 375 memset(&up, 0, sizeof(up)); 376 if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update))) 377 return -EFAULT; 378 if (up.resv || up.resv2) 379 return -EINVAL; 380 return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args); 381 } 382 383 int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, 384 unsigned size, unsigned type) 385 { 386 struct io_uring_rsrc_update2 up; 387 388 if (size != sizeof(up)) 389 return -EINVAL; 390 if (copy_from_user(&up, arg, sizeof(up))) 391 return -EFAULT; 392 if (!up.nr || up.resv || up.resv2) 393 return -EINVAL; 394 return __io_register_rsrc_update(ctx, type, &up, up.nr); 395 } 396 397 __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, 398 unsigned int size, unsigned int type) 399 { 400 struct io_uring_rsrc_register rr; 401 402 /* keep it extendible */ 403 if (size != sizeof(rr)) 404 return -EINVAL; 405 406 memset(&rr, 0, sizeof(rr)); 407 if (copy_from_user(&rr, arg, size)) 408 return -EFAULT; 409 if (!rr.nr || rr.resv2) 410 return -EINVAL; 411 if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE) 412 return -EINVAL; 413 414 switch (type) { 415 case IORING_RSRC_FILE: 416 if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) 417 break; 418 return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data), 419 rr.nr, u64_to_user_ptr(rr.tags)); 420 case IORING_RSRC_BUFFER: 421 if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) 422 break; 423 return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data), 424 rr.nr, u64_to_user_ptr(rr.tags)); 425 } 426 return -EINVAL; 427 } 428 429 int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 430 { 431 struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update); 432 433 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) 434 return -EINVAL; 435 if (sqe->rw_flags || sqe->splice_fd_in) 436 return -EINVAL; 437 438 up->offset = READ_ONCE(sqe->off); 439 up->nr_args = READ_ONCE(sqe->len); 440 if (!up->nr_args) 441 return -EINVAL; 442 up->arg = READ_ONCE(sqe->addr); 443 return 0; 444 } 445 446 static int io_files_update_with_index_alloc(struct io_kiocb *req, 447 unsigned int issue_flags) 448 { 449 struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update); 450 __s32 __user *fds = u64_to_user_ptr(up->arg); 451 unsigned int done; 452 struct file *file; 453 int ret, fd; 454 455 if (!req->ctx->file_table.data.nr) 456 return -ENXIO; 457 458 for (done = 0; done < up->nr_args; done++) { 459 if (get_user(fd, &fds[done])) { 460 ret = -EFAULT; 461 break; 462 } 463 464 file = fget(fd); 465 if (!file) { 466 ret = -EBADF; 467 break; 468 } 469 ret = io_fixed_fd_install(req, issue_flags, file, 470 IORING_FILE_INDEX_ALLOC); 471 if (ret < 0) 472 break; 473 if (put_user(ret, &fds[done])) { 474 __io_close_fixed(req->ctx, issue_flags, ret); 475 ret = -EFAULT; 476 break; 477 } 478 } 479 480 if (done) 481 return done; 482 return ret; 483 } 484 485 int io_files_update(struct io_kiocb *req, unsigned int issue_flags) 486 { 487 struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update); 488 struct io_ring_ctx *ctx = req->ctx; 489 struct io_uring_rsrc_update2 up2; 490 int ret; 491 492 up2.offset = up->offset; 493 up2.data = up->arg; 494 up2.nr = 0; 495 up2.tags = 0; 496 up2.resv = 0; 497 up2.resv2 = 0; 498 499 if (up->offset == IORING_FILE_INDEX_ALLOC) { 500 ret = io_files_update_with_index_alloc(req, issue_flags); 501 } else { 502 io_ring_submit_lock(ctx, issue_flags); 503 ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, 504 &up2, up->nr_args); 505 io_ring_submit_unlock(ctx, issue_flags); 506 } 507 508 if (ret < 0) 509 req_set_fail(req); 510 io_req_set_res(req, ret, 0); 511 return IOU_COMPLETE; 512 } 513 514 void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) 515 { 516 if (node->tag) 517 io_post_aux_cqe(ctx, node->tag, 0, 0); 518 519 switch (node->type) { 520 case IORING_RSRC_FILE: 521 fput(io_slot_file(node)); 522 break; 523 case IORING_RSRC_BUFFER: 524 io_buffer_unmap(ctx, node->buf); 525 break; 526 default: 527 WARN_ON_ONCE(1); 528 break; 529 } 530 531 io_cache_free(&ctx->node_cache, node); 532 } 533 534 int io_sqe_files_unregister(struct io_ring_ctx *ctx) 535 { 536 if (!ctx->file_table.data.nr) 537 return -ENXIO; 538 539 io_free_file_tables(ctx, &ctx->file_table); 540 io_file_table_set_alloc_range(ctx, 0, 0); 541 return 0; 542 } 543 544 int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, 545 unsigned nr_args, u64 __user *tags) 546 { 547 __s32 __user *fds = (__s32 __user *) arg; 548 struct file *file; 549 int fd, ret; 550 unsigned i; 551 552 if (ctx->file_table.data.nr) 553 return -EBUSY; 554 if (!nr_args) 555 return -EINVAL; 556 if (nr_args > IORING_MAX_FIXED_FILES) 557 return -EMFILE; 558 if (nr_args > rlimit(RLIMIT_NOFILE)) 559 return -EMFILE; 560 if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args)) 561 return -ENOMEM; 562 563 for (i = 0; i < nr_args; i++) { 564 struct io_rsrc_node *node; 565 u64 tag = 0; 566 567 ret = -EFAULT; 568 if (tags && copy_from_user(&tag, &tags[i], sizeof(tag))) 569 goto fail; 570 if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) 571 goto fail; 572 /* allow sparse sets */ 573 if (!fds || fd == -1) { 574 ret = -EINVAL; 575 if (tag) 576 goto fail; 577 continue; 578 } 579 580 file = fget(fd); 581 ret = -EBADF; 582 if (unlikely(!file)) 583 goto fail; 584 585 /* 586 * Don't allow io_uring instances to be registered. 587 */ 588 if (io_is_uring_fops(file)) { 589 fput(file); 590 goto fail; 591 } 592 ret = -ENOMEM; 593 node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); 594 if (!node) { 595 fput(file); 596 goto fail; 597 } 598 if (tag) 599 node->tag = tag; 600 ctx->file_table.data.nodes[i] = node; 601 io_fixed_file_set(node, file); 602 io_file_bitmap_set(&ctx->file_table, i); 603 } 604 605 /* default it to the whole table */ 606 io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr); 607 return 0; 608 fail: 609 io_clear_table_tags(&ctx->file_table.data); 610 io_sqe_files_unregister(ctx); 611 return ret; 612 } 613 614 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) 615 { 616 if (!ctx->buf_table.nr) 617 return -ENXIO; 618 io_rsrc_data_free(ctx, &ctx->buf_table); 619 return 0; 620 } 621 622 /* 623 * Not super efficient, but this is just a registration time. And we do cache 624 * the last compound head, so generally we'll only do a full search if we don't 625 * match that one. 626 * 627 * We check if the given compound head page has already been accounted, to 628 * avoid double accounting it. This allows us to account the full size of the 629 * page, not just the constituent pages of a huge page. 630 */ 631 static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages, 632 int nr_pages, struct page *hpage) 633 { 634 int i, j; 635 636 /* check current page array */ 637 for (i = 0; i < nr_pages; i++) { 638 if (!PageCompound(pages[i])) 639 continue; 640 if (compound_head(pages[i]) == hpage) 641 return true; 642 } 643 644 /* check previously registered pages */ 645 for (i = 0; i < ctx->buf_table.nr; i++) { 646 struct io_rsrc_node *node = ctx->buf_table.nodes[i]; 647 struct io_mapped_ubuf *imu; 648 649 if (!node) 650 continue; 651 imu = node->buf; 652 for (j = 0; j < imu->nr_bvecs; j++) { 653 if (!PageCompound(imu->bvec[j].bv_page)) 654 continue; 655 if (compound_head(imu->bvec[j].bv_page) == hpage) 656 return true; 657 } 658 } 659 660 return false; 661 } 662 663 static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, 664 int nr_pages, struct io_mapped_ubuf *imu, 665 struct page **last_hpage) 666 { 667 int i, ret; 668 669 imu->acct_pages = 0; 670 for (i = 0; i < nr_pages; i++) { 671 if (!PageCompound(pages[i])) { 672 imu->acct_pages++; 673 } else { 674 struct page *hpage; 675 676 hpage = compound_head(pages[i]); 677 if (hpage == *last_hpage) 678 continue; 679 *last_hpage = hpage; 680 if (headpage_already_acct(ctx, pages, i, hpage)) 681 continue; 682 imu->acct_pages += page_size(hpage) >> PAGE_SHIFT; 683 } 684 } 685 686 if (!imu->acct_pages) 687 return 0; 688 689 ret = io_account_mem(ctx->user, ctx->mm_account, imu->acct_pages); 690 if (ret) 691 imu->acct_pages = 0; 692 return ret; 693 } 694 695 static bool io_coalesce_buffer(struct page ***pages, int *nr_pages, 696 struct io_imu_folio_data *data) 697 { 698 struct page **page_array = *pages, **new_array = NULL; 699 unsigned nr_pages_left = *nr_pages; 700 unsigned nr_folios = data->nr_folios; 701 unsigned i, j; 702 703 /* Store head pages only*/ 704 new_array = kvmalloc_array(nr_folios, sizeof(struct page *), GFP_KERNEL); 705 if (!new_array) 706 return false; 707 708 for (i = 0, j = 0; i < nr_folios; i++) { 709 struct page *p = compound_head(page_array[j]); 710 struct folio *folio = page_folio(p); 711 unsigned int nr; 712 713 WARN_ON_ONCE(i > 0 && p != page_array[j]); 714 715 nr = i ? data->nr_pages_mid : data->nr_pages_head; 716 nr = min(nr, nr_pages_left); 717 /* Drop all but one ref, the entire folio will remain pinned. */ 718 if (nr > 1) 719 unpin_user_folio(folio, nr - 1); 720 j += nr; 721 nr_pages_left -= nr; 722 new_array[i] = p; 723 } 724 725 WARN_ON_ONCE(j != *nr_pages); 726 727 kvfree(page_array); 728 *pages = new_array; 729 *nr_pages = nr_folios; 730 return true; 731 } 732 733 bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, 734 struct io_imu_folio_data *data) 735 { 736 struct folio *folio = page_folio(page_array[0]); 737 unsigned int count = 1, nr_folios = 1; 738 int i; 739 740 data->nr_pages_mid = folio_nr_pages(folio); 741 data->folio_shift = folio_shift(folio); 742 data->first_folio_page_idx = folio_page_idx(folio, page_array[0]); 743 744 /* 745 * Check if pages are contiguous inside a folio, and all folios have 746 * the same page count except for the head and tail. 747 */ 748 for (i = 1; i < nr_pages; i++) { 749 if (page_folio(page_array[i]) == folio && 750 page_array[i] == page_array[i-1] + 1) { 751 count++; 752 continue; 753 } 754 755 if (nr_folios == 1) { 756 if (folio_page_idx(folio, page_array[i-1]) != 757 data->nr_pages_mid - 1) 758 return false; 759 760 data->nr_pages_head = count; 761 } else if (count != data->nr_pages_mid) { 762 return false; 763 } 764 765 folio = page_folio(page_array[i]); 766 if (folio_size(folio) != (1UL << data->folio_shift) || 767 folio_page_idx(folio, page_array[i]) != 0) 768 return false; 769 770 count = 1; 771 nr_folios++; 772 } 773 if (nr_folios == 1) 774 data->nr_pages_head = count; 775 776 data->nr_folios = nr_folios; 777 return true; 778 } 779 780 static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, 781 struct iovec *iov, 782 struct page **last_hpage) 783 { 784 struct io_mapped_ubuf *imu = NULL; 785 struct page **pages = NULL; 786 struct io_rsrc_node *node; 787 unsigned long off; 788 size_t size; 789 int ret, nr_pages, i; 790 struct io_imu_folio_data data; 791 bool coalesced = false; 792 793 if (!iov->iov_base) 794 return NULL; 795 796 node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); 797 if (!node) 798 return ERR_PTR(-ENOMEM); 799 800 ret = -ENOMEM; 801 pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, 802 &nr_pages); 803 if (IS_ERR(pages)) { 804 ret = PTR_ERR(pages); 805 pages = NULL; 806 goto done; 807 } 808 809 /* If it's huge page(s), try to coalesce them into fewer bvec entries */ 810 if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) { 811 if (data.nr_pages_mid != 1) 812 coalesced = io_coalesce_buffer(&pages, &nr_pages, &data); 813 } 814 815 imu = io_alloc_imu(ctx, nr_pages); 816 if (!imu) 817 goto done; 818 819 imu->nr_bvecs = nr_pages; 820 ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage); 821 if (ret) 822 goto done; 823 824 size = iov->iov_len; 825 /* store original address for later verification */ 826 imu->ubuf = (unsigned long) iov->iov_base; 827 imu->len = iov->iov_len; 828 imu->folio_shift = PAGE_SHIFT; 829 imu->release = io_release_ubuf; 830 imu->priv = imu; 831 imu->is_kbuf = false; 832 imu->dir = IO_IMU_DEST | IO_IMU_SOURCE; 833 if (coalesced) 834 imu->folio_shift = data.folio_shift; 835 refcount_set(&imu->refs, 1); 836 837 off = (unsigned long)iov->iov_base & ~PAGE_MASK; 838 if (coalesced) 839 off += data.first_folio_page_idx << PAGE_SHIFT; 840 841 node->buf = imu; 842 ret = 0; 843 844 for (i = 0; i < nr_pages; i++) { 845 size_t vec_len; 846 847 vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off); 848 bvec_set_page(&imu->bvec[i], pages[i], vec_len, off); 849 off = 0; 850 size -= vec_len; 851 } 852 done: 853 if (ret) { 854 if (imu) 855 io_free_imu(ctx, imu); 856 if (pages) { 857 for (i = 0; i < nr_pages; i++) 858 unpin_user_folio(page_folio(pages[i]), 1); 859 } 860 io_cache_free(&ctx->node_cache, node); 861 node = ERR_PTR(ret); 862 } 863 kvfree(pages); 864 return node; 865 } 866 867 int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, 868 unsigned int nr_args, u64 __user *tags) 869 { 870 struct page *last_hpage = NULL; 871 struct io_rsrc_data data; 872 struct iovec fast_iov, *iov = &fast_iov; 873 const struct iovec __user *uvec; 874 int i, ret; 875 876 BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); 877 878 if (ctx->buf_table.nr) 879 return -EBUSY; 880 if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) 881 return -EINVAL; 882 ret = io_rsrc_data_alloc(&data, nr_args); 883 if (ret) 884 return ret; 885 886 if (!arg) 887 memset(iov, 0, sizeof(*iov)); 888 889 for (i = 0; i < nr_args; i++) { 890 struct io_rsrc_node *node; 891 u64 tag = 0; 892 893 if (arg) { 894 uvec = (struct iovec __user *) arg; 895 iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); 896 if (IS_ERR(iov)) { 897 ret = PTR_ERR(iov); 898 break; 899 } 900 ret = io_buffer_validate(iov); 901 if (ret) 902 break; 903 if (ctx->compat) 904 arg += sizeof(struct compat_iovec); 905 else 906 arg += sizeof(struct iovec); 907 } 908 909 if (tags) { 910 if (copy_from_user(&tag, &tags[i], sizeof(tag))) { 911 ret = -EFAULT; 912 break; 913 } 914 } 915 916 node = io_sqe_buffer_register(ctx, iov, &last_hpage); 917 if (IS_ERR(node)) { 918 ret = PTR_ERR(node); 919 break; 920 } 921 if (tag) { 922 if (!node) { 923 ret = -EINVAL; 924 break; 925 } 926 node->tag = tag; 927 } 928 data.nodes[i] = node; 929 } 930 931 ctx->buf_table = data; 932 if (ret) { 933 io_clear_table_tags(&ctx->buf_table); 934 io_sqe_buffers_unregister(ctx); 935 } 936 return ret; 937 } 938 939 int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, 940 void (*release)(void *), unsigned int index, 941 unsigned int issue_flags) 942 { 943 struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; 944 struct io_rsrc_data *data = &ctx->buf_table; 945 struct req_iterator rq_iter; 946 struct io_mapped_ubuf *imu; 947 struct io_rsrc_node *node; 948 struct bio_vec bv; 949 unsigned int nr_bvecs = 0; 950 int ret = 0; 951 952 io_ring_submit_lock(ctx, issue_flags); 953 if (index >= data->nr) { 954 ret = -EINVAL; 955 goto unlock; 956 } 957 index = array_index_nospec(index, data->nr); 958 959 if (data->nodes[index]) { 960 ret = -EBUSY; 961 goto unlock; 962 } 963 964 node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); 965 if (!node) { 966 ret = -ENOMEM; 967 goto unlock; 968 } 969 970 /* 971 * blk_rq_nr_phys_segments() may overestimate the number of bvecs 972 * but avoids needing to iterate over the bvecs 973 */ 974 imu = io_alloc_imu(ctx, blk_rq_nr_phys_segments(rq)); 975 if (!imu) { 976 kfree(node); 977 ret = -ENOMEM; 978 goto unlock; 979 } 980 981 imu->ubuf = 0; 982 imu->len = blk_rq_bytes(rq); 983 imu->acct_pages = 0; 984 imu->folio_shift = PAGE_SHIFT; 985 refcount_set(&imu->refs, 1); 986 imu->release = release; 987 imu->priv = rq; 988 imu->is_kbuf = true; 989 imu->dir = 1 << rq_data_dir(rq); 990 991 rq_for_each_bvec(bv, rq, rq_iter) 992 imu->bvec[nr_bvecs++] = bv; 993 imu->nr_bvecs = nr_bvecs; 994 995 node->buf = imu; 996 data->nodes[index] = node; 997 unlock: 998 io_ring_submit_unlock(ctx, issue_flags); 999 return ret; 1000 } 1001 EXPORT_SYMBOL_GPL(io_buffer_register_bvec); 1002 1003 int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, 1004 unsigned int issue_flags) 1005 { 1006 struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; 1007 struct io_rsrc_data *data = &ctx->buf_table; 1008 struct io_rsrc_node *node; 1009 int ret = 0; 1010 1011 io_ring_submit_lock(ctx, issue_flags); 1012 if (index >= data->nr) { 1013 ret = -EINVAL; 1014 goto unlock; 1015 } 1016 index = array_index_nospec(index, data->nr); 1017 1018 node = data->nodes[index]; 1019 if (!node) { 1020 ret = -EINVAL; 1021 goto unlock; 1022 } 1023 if (!node->buf->is_kbuf) { 1024 ret = -EBUSY; 1025 goto unlock; 1026 } 1027 1028 io_put_rsrc_node(ctx, node); 1029 data->nodes[index] = NULL; 1030 unlock: 1031 io_ring_submit_unlock(ctx, issue_flags); 1032 return ret; 1033 } 1034 EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec); 1035 1036 static int validate_fixed_range(u64 buf_addr, size_t len, 1037 const struct io_mapped_ubuf *imu) 1038 { 1039 u64 buf_end; 1040 1041 if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) 1042 return -EFAULT; 1043 /* not inside the mapped region */ 1044 if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len))) 1045 return -EFAULT; 1046 if (unlikely(len > MAX_RW_COUNT)) 1047 return -EFAULT; 1048 return 0; 1049 } 1050 1051 static int io_import_kbuf(int ddir, struct iov_iter *iter, 1052 struct io_mapped_ubuf *imu, size_t len, size_t offset) 1053 { 1054 size_t count = len + offset; 1055 1056 iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count); 1057 iov_iter_advance(iter, offset); 1058 1059 if (count < imu->len) { 1060 const struct bio_vec *bvec = iter->bvec; 1061 1062 while (len > bvec->bv_len) { 1063 len -= bvec->bv_len; 1064 bvec++; 1065 } 1066 iter->nr_segs = 1 + bvec - iter->bvec; 1067 } 1068 return 0; 1069 } 1070 1071 static int io_import_fixed(int ddir, struct iov_iter *iter, 1072 struct io_mapped_ubuf *imu, 1073 u64 buf_addr, size_t len) 1074 { 1075 const struct bio_vec *bvec; 1076 size_t folio_mask; 1077 unsigned nr_segs; 1078 size_t offset; 1079 int ret; 1080 1081 ret = validate_fixed_range(buf_addr, len, imu); 1082 if (unlikely(ret)) 1083 return ret; 1084 if (!(imu->dir & (1 << ddir))) 1085 return -EFAULT; 1086 1087 offset = buf_addr - imu->ubuf; 1088 1089 if (imu->is_kbuf) 1090 return io_import_kbuf(ddir, iter, imu, len, offset); 1091 1092 /* 1093 * Don't use iov_iter_advance() here, as it's really slow for 1094 * using the latter parts of a big fixed buffer - it iterates 1095 * over each segment manually. We can cheat a bit here for user 1096 * registered nodes, because we know that: 1097 * 1098 * 1) it's a BVEC iter, we set it up 1099 * 2) all bvecs are the same in size, except potentially the 1100 * first and last bvec 1101 */ 1102 folio_mask = (1UL << imu->folio_shift) - 1; 1103 bvec = imu->bvec; 1104 if (offset >= bvec->bv_len) { 1105 unsigned long seg_skip; 1106 1107 /* skip first vec */ 1108 offset -= bvec->bv_len; 1109 seg_skip = 1 + (offset >> imu->folio_shift); 1110 bvec += seg_skip; 1111 offset &= folio_mask; 1112 } 1113 nr_segs = (offset + len + bvec->bv_offset + folio_mask) >> imu->folio_shift; 1114 iov_iter_bvec(iter, ddir, bvec, nr_segs, len); 1115 iter->iov_offset = offset; 1116 return 0; 1117 } 1118 1119 inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req, 1120 unsigned issue_flags) 1121 { 1122 struct io_ring_ctx *ctx = req->ctx; 1123 struct io_rsrc_node *node; 1124 1125 if (req->flags & REQ_F_BUF_NODE) 1126 return req->buf_node; 1127 req->flags |= REQ_F_BUF_NODE; 1128 1129 io_ring_submit_lock(ctx, issue_flags); 1130 node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index); 1131 if (node) { 1132 node->refs++; 1133 req->buf_node = node; 1134 io_ring_submit_unlock(ctx, issue_flags); 1135 return node; 1136 } 1137 req->flags &= ~REQ_F_BUF_NODE; 1138 io_ring_submit_unlock(ctx, issue_flags); 1139 return NULL; 1140 } 1141 1142 int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, 1143 u64 buf_addr, size_t len, int ddir, 1144 unsigned issue_flags) 1145 { 1146 struct io_rsrc_node *node; 1147 1148 node = io_find_buf_node(req, issue_flags); 1149 if (!node) 1150 return -EFAULT; 1151 return io_import_fixed(ddir, iter, node->buf, buf_addr, len); 1152 } 1153 1154 /* Lock two rings at once. The rings must be different! */ 1155 static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2) 1156 { 1157 if (ctx1 > ctx2) 1158 swap(ctx1, ctx2); 1159 mutex_lock(&ctx1->uring_lock); 1160 mutex_lock_nested(&ctx2->uring_lock, SINGLE_DEPTH_NESTING); 1161 } 1162 1163 /* Both rings are locked by the caller. */ 1164 static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx, 1165 struct io_uring_clone_buffers *arg) 1166 { 1167 struct io_rsrc_data data; 1168 int i, ret, off, nr; 1169 unsigned int nbufs; 1170 1171 lockdep_assert_held(&ctx->uring_lock); 1172 lockdep_assert_held(&src_ctx->uring_lock); 1173 1174 /* 1175 * Accounting state is shared between the two rings; that only works if 1176 * both rings are accounted towards the same counters. 1177 */ 1178 if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account) 1179 return -EINVAL; 1180 1181 /* if offsets are given, must have nr specified too */ 1182 if (!arg->nr && (arg->dst_off || arg->src_off)) 1183 return -EINVAL; 1184 /* not allowed unless REPLACE is set */ 1185 if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE)) 1186 return -EBUSY; 1187 1188 nbufs = src_ctx->buf_table.nr; 1189 if (!arg->nr) 1190 arg->nr = nbufs; 1191 else if (arg->nr > nbufs) 1192 return -EINVAL; 1193 else if (arg->nr > IORING_MAX_REG_BUFFERS) 1194 return -EINVAL; 1195 if (check_add_overflow(arg->nr, arg->dst_off, &nbufs)) 1196 return -EOVERFLOW; 1197 if (nbufs > IORING_MAX_REG_BUFFERS) 1198 return -EINVAL; 1199 1200 ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr)); 1201 if (ret) 1202 return ret; 1203 1204 /* Fill entries in data from dst that won't overlap with src */ 1205 for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) { 1206 struct io_rsrc_node *src_node = ctx->buf_table.nodes[i]; 1207 1208 if (src_node) { 1209 data.nodes[i] = src_node; 1210 src_node->refs++; 1211 } 1212 } 1213 1214 ret = -ENXIO; 1215 nbufs = src_ctx->buf_table.nr; 1216 if (!nbufs) 1217 goto out_free; 1218 ret = -EINVAL; 1219 if (!arg->nr) 1220 arg->nr = nbufs; 1221 else if (arg->nr > nbufs) 1222 goto out_free; 1223 ret = -EOVERFLOW; 1224 if (check_add_overflow(arg->nr, arg->src_off, &off)) 1225 goto out_free; 1226 if (off > nbufs) 1227 goto out_free; 1228 1229 off = arg->dst_off; 1230 i = arg->src_off; 1231 nr = arg->nr; 1232 while (nr--) { 1233 struct io_rsrc_node *dst_node, *src_node; 1234 1235 src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i); 1236 if (!src_node) { 1237 dst_node = NULL; 1238 } else { 1239 dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); 1240 if (!dst_node) { 1241 ret = -ENOMEM; 1242 goto out_free; 1243 } 1244 1245 refcount_inc(&src_node->buf->refs); 1246 dst_node->buf = src_node->buf; 1247 } 1248 data.nodes[off++] = dst_node; 1249 i++; 1250 } 1251 1252 /* 1253 * If asked for replace, put the old table. data->nodes[] holds both 1254 * old and new nodes at this point. 1255 */ 1256 if (arg->flags & IORING_REGISTER_DST_REPLACE) 1257 io_rsrc_data_free(ctx, &ctx->buf_table); 1258 1259 /* 1260 * ctx->buf_table must be empty now - either the contents are being 1261 * replaced and we just freed the table, or the contents are being 1262 * copied to a ring that does not have buffers yet (checked at function 1263 * entry). 1264 */ 1265 WARN_ON_ONCE(ctx->buf_table.nr); 1266 ctx->buf_table = data; 1267 return 0; 1268 1269 out_free: 1270 io_rsrc_data_free(ctx, &data); 1271 return ret; 1272 } 1273 1274 /* 1275 * Copy the registered buffers from the source ring whose file descriptor 1276 * is given in the src_fd to the current ring. This is identical to registering 1277 * the buffers with ctx, except faster as mappings already exist. 1278 * 1279 * Since the memory is already accounted once, don't account it again. 1280 */ 1281 int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg) 1282 { 1283 struct io_uring_clone_buffers buf; 1284 struct io_ring_ctx *src_ctx; 1285 bool registered_src; 1286 struct file *file; 1287 int ret; 1288 1289 if (copy_from_user(&buf, arg, sizeof(buf))) 1290 return -EFAULT; 1291 if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE)) 1292 return -EINVAL; 1293 if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr) 1294 return -EBUSY; 1295 if (memchr_inv(buf.pad, 0, sizeof(buf.pad))) 1296 return -EINVAL; 1297 1298 registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0; 1299 file = io_uring_register_get_file(buf.src_fd, registered_src); 1300 if (IS_ERR(file)) 1301 return PTR_ERR(file); 1302 1303 src_ctx = file->private_data; 1304 if (src_ctx != ctx) { 1305 mutex_unlock(&ctx->uring_lock); 1306 lock_two_rings(ctx, src_ctx); 1307 1308 if (src_ctx->submitter_task && 1309 src_ctx->submitter_task != current) { 1310 ret = -EEXIST; 1311 goto out; 1312 } 1313 } 1314 1315 ret = io_clone_buffers(ctx, src_ctx, &buf); 1316 1317 out: 1318 if (src_ctx != ctx) 1319 mutex_unlock(&src_ctx->uring_lock); 1320 1321 fput(file); 1322 return ret; 1323 } 1324 1325 void io_vec_free(struct iou_vec *iv) 1326 { 1327 if (!iv->iovec) 1328 return; 1329 kfree(iv->iovec); 1330 iv->iovec = NULL; 1331 iv->nr = 0; 1332 } 1333 1334 int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries) 1335 { 1336 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; 1337 struct iovec *iov; 1338 1339 iov = kmalloc_array(nr_entries, sizeof(iov[0]), gfp); 1340 if (!iov) 1341 return -ENOMEM; 1342 1343 io_vec_free(iv); 1344 iv->iovec = iov; 1345 iv->nr = nr_entries; 1346 return 0; 1347 } 1348 1349 static int io_vec_fill_bvec(int ddir, struct iov_iter *iter, 1350 struct io_mapped_ubuf *imu, 1351 struct iovec *iovec, unsigned nr_iovs, 1352 struct iou_vec *vec) 1353 { 1354 unsigned long folio_size = 1 << imu->folio_shift; 1355 unsigned long folio_mask = folio_size - 1; 1356 struct bio_vec *res_bvec = vec->bvec; 1357 size_t total_len = 0; 1358 unsigned bvec_idx = 0; 1359 unsigned iov_idx; 1360 1361 for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) { 1362 size_t iov_len = iovec[iov_idx].iov_len; 1363 u64 buf_addr = (u64)(uintptr_t)iovec[iov_idx].iov_base; 1364 struct bio_vec *src_bvec; 1365 size_t offset; 1366 int ret; 1367 1368 ret = validate_fixed_range(buf_addr, iov_len, imu); 1369 if (unlikely(ret)) 1370 return ret; 1371 1372 if (unlikely(!iov_len)) 1373 return -EFAULT; 1374 if (unlikely(check_add_overflow(total_len, iov_len, &total_len))) 1375 return -EOVERFLOW; 1376 1377 offset = buf_addr - imu->ubuf; 1378 /* 1379 * Only the first bvec can have non zero bv_offset, account it 1380 * here and work with full folios below. 1381 */ 1382 offset += imu->bvec[0].bv_offset; 1383 1384 src_bvec = imu->bvec + (offset >> imu->folio_shift); 1385 offset &= folio_mask; 1386 1387 for (; iov_len; offset = 0, bvec_idx++, src_bvec++) { 1388 size_t seg_size = min_t(size_t, iov_len, 1389 folio_size - offset); 1390 1391 bvec_set_page(&res_bvec[bvec_idx], 1392 src_bvec->bv_page, seg_size, offset); 1393 iov_len -= seg_size; 1394 } 1395 } 1396 if (total_len > MAX_RW_COUNT) 1397 return -EINVAL; 1398 1399 iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len); 1400 return 0; 1401 } 1402 1403 static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs, 1404 struct io_mapped_ubuf *imu) 1405 { 1406 unsigned shift = imu->folio_shift; 1407 size_t max_segs = 0; 1408 unsigned i; 1409 1410 for (i = 0; i < nr_iovs; i++) { 1411 max_segs += (iov[i].iov_len >> shift) + 2; 1412 if (max_segs > INT_MAX) 1413 return -EOVERFLOW; 1414 } 1415 return max_segs; 1416 } 1417 1418 static int io_vec_fill_kern_bvec(int ddir, struct iov_iter *iter, 1419 struct io_mapped_ubuf *imu, 1420 struct iovec *iovec, unsigned nr_iovs, 1421 struct iou_vec *vec) 1422 { 1423 const struct bio_vec *src_bvec = imu->bvec; 1424 struct bio_vec *res_bvec = vec->bvec; 1425 unsigned res_idx = 0; 1426 size_t total_len = 0; 1427 unsigned iov_idx; 1428 1429 for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) { 1430 size_t offset = (size_t)(uintptr_t)iovec[iov_idx].iov_base; 1431 size_t iov_len = iovec[iov_idx].iov_len; 1432 struct bvec_iter bi = { 1433 .bi_size = offset + iov_len, 1434 }; 1435 struct bio_vec bv; 1436 1437 bvec_iter_advance(src_bvec, &bi, offset); 1438 for_each_mp_bvec(bv, src_bvec, bi, bi) 1439 res_bvec[res_idx++] = bv; 1440 total_len += iov_len; 1441 } 1442 iov_iter_bvec(iter, ddir, res_bvec, res_idx, total_len); 1443 return 0; 1444 } 1445 1446 static int iov_kern_bvec_size(const struct iovec *iov, 1447 const struct io_mapped_ubuf *imu, 1448 unsigned int *nr_seg) 1449 { 1450 size_t offset = (size_t)(uintptr_t)iov->iov_base; 1451 const struct bio_vec *bvec = imu->bvec; 1452 int start = 0, i = 0; 1453 size_t off = 0; 1454 int ret; 1455 1456 ret = validate_fixed_range(offset, iov->iov_len, imu); 1457 if (unlikely(ret)) 1458 return ret; 1459 1460 for (i = 0; off < offset + iov->iov_len && i < imu->nr_bvecs; 1461 off += bvec[i].bv_len, i++) { 1462 if (offset >= off && offset < off + bvec[i].bv_len) 1463 start = i; 1464 } 1465 *nr_seg = i - start; 1466 return 0; 1467 } 1468 1469 static int io_kern_bvec_size(struct iovec *iov, unsigned nr_iovs, 1470 struct io_mapped_ubuf *imu, unsigned *nr_segs) 1471 { 1472 unsigned max_segs = 0; 1473 size_t total_len = 0; 1474 unsigned i; 1475 int ret; 1476 1477 *nr_segs = 0; 1478 for (i = 0; i < nr_iovs; i++) { 1479 if (unlikely(!iov[i].iov_len)) 1480 return -EFAULT; 1481 if (unlikely(check_add_overflow(total_len, iov[i].iov_len, 1482 &total_len))) 1483 return -EOVERFLOW; 1484 ret = iov_kern_bvec_size(&iov[i], imu, &max_segs); 1485 if (unlikely(ret)) 1486 return ret; 1487 *nr_segs += max_segs; 1488 } 1489 if (total_len > MAX_RW_COUNT) 1490 return -EINVAL; 1491 return 0; 1492 } 1493 1494 int io_import_reg_vec(int ddir, struct iov_iter *iter, 1495 struct io_kiocb *req, struct iou_vec *vec, 1496 unsigned nr_iovs, unsigned issue_flags) 1497 { 1498 struct io_rsrc_node *node; 1499 struct io_mapped_ubuf *imu; 1500 unsigned iovec_off; 1501 struct iovec *iov; 1502 unsigned nr_segs; 1503 1504 node = io_find_buf_node(req, issue_flags); 1505 if (!node) 1506 return -EFAULT; 1507 imu = node->buf; 1508 if (!(imu->dir & (1 << ddir))) 1509 return -EFAULT; 1510 1511 iovec_off = vec->nr - nr_iovs; 1512 iov = vec->iovec + iovec_off; 1513 1514 if (imu->is_kbuf) { 1515 int ret = io_kern_bvec_size(iov, nr_iovs, imu, &nr_segs); 1516 1517 if (unlikely(ret)) 1518 return ret; 1519 } else { 1520 int ret = io_estimate_bvec_size(iov, nr_iovs, imu); 1521 1522 if (ret < 0) 1523 return ret; 1524 nr_segs = ret; 1525 } 1526 1527 if (sizeof(struct bio_vec) > sizeof(struct iovec)) { 1528 size_t bvec_bytes; 1529 1530 bvec_bytes = nr_segs * sizeof(struct bio_vec); 1531 nr_segs = (bvec_bytes + sizeof(*iov) - 1) / sizeof(*iov); 1532 nr_segs += nr_iovs; 1533 } 1534 1535 if (nr_segs > vec->nr) { 1536 struct iou_vec tmp_vec = {}; 1537 int ret; 1538 1539 ret = io_vec_realloc(&tmp_vec, nr_segs); 1540 if (ret) 1541 return ret; 1542 1543 iovec_off = tmp_vec.nr - nr_iovs; 1544 memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs); 1545 io_vec_free(vec); 1546 1547 *vec = tmp_vec; 1548 iov = vec->iovec + iovec_off; 1549 req->flags |= REQ_F_NEED_CLEANUP; 1550 } 1551 1552 if (imu->is_kbuf) 1553 return io_vec_fill_kern_bvec(ddir, iter, imu, iov, nr_iovs, vec); 1554 1555 return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec); 1556 } 1557 1558 int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv, 1559 const struct iovec __user *uvec, size_t uvec_segs) 1560 { 1561 struct iovec *iov; 1562 int iovec_off, ret; 1563 void *res; 1564 1565 if (uvec_segs > iv->nr) { 1566 ret = io_vec_realloc(iv, uvec_segs); 1567 if (ret) 1568 return ret; 1569 req->flags |= REQ_F_NEED_CLEANUP; 1570 } 1571 1572 /* pad iovec to the right */ 1573 iovec_off = iv->nr - uvec_segs; 1574 iov = iv->iovec + iovec_off; 1575 res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov, 1576 io_is_compat(req->ctx)); 1577 if (IS_ERR(res)) 1578 return PTR_ERR(res); 1579 1580 req->flags |= REQ_F_IMPORT_BUFFER; 1581 return 0; 1582 } 1583