// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring/cmd.h>

#include <uapi/linux/io_uring.h>

#include "filetable.h"
#include "io_uring.h"
#include "openclose.h"
#include "rsrc.h"
#include "memmap.h"
#include "register.h"

struct io_rsrc_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
};

static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
			struct iovec *iov, struct page **last_hpage);

/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)

#define IO_CACHED_BVECS_SEGS	32

int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
	return 0;
}

void io_unaccount_mem(struct user_struct *user, struct mm_struct *mm_account,
		      unsigned long nr_pages)
{
	if (user)
		__io_unaccount_mem(user, nr_pages);

	if (mm_account)
		atomic64_sub(nr_pages, &mm_account->pinned_vm);
}

int io_account_mem(struct user_struct *user, struct mm_struct *mm_account,
		   unsigned long nr_pages)
{
	int ret;

	if (user) {
		ret = __io_account_mem(user, nr_pages);
		if (ret)
			return ret;
	}

	if (mm_account)
		atomic64_add(nr_pages, &mm_account->pinned_vm);

	return 0;
}

int io_validate_user_buf_range(u64 uaddr, u64 ulen)
{
	unsigned long tmp, base = (unsigned long)uaddr;
	unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen);

	/* arbitrary limit, but we need something */
	if (ulen > SZ_1G || !ulen)
		return -EFAULT;
	if (check_add_overflow(base, acct_len, &tmp))
		return -EOVERFLOW;
	return 0;
}

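/*
 * Worked example (illustrative only, numbers are made up): with
 * RLIMIT_MEMLOCK set to 8 MiB and 4 KiB pages, page_limit above is 2048
 * pages.  Registering a 1 MiB user buffer accounts 256 pages against
 * user->locked_vm, so roughly eight such buffers fit before
 * __io_account_mem() starts returning -ENOMEM.  The try_cmpxchg loop keeps
 * the limit check and the update atomic against concurrent registrations
 * by the same user.
 */
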
static int io_buffer_validate(struct iovec *iov)
{
	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
	if (!iov->iov_base)
		return iov->iov_len ? -EFAULT : 0;

	return io_validate_user_buf_range((unsigned long)iov->iov_base,
					  iov->iov_len);
}

static void io_release_ubuf(void *priv)
{
	struct io_mapped_ubuf *imu = priv;
	unsigned int i;

	for (i = 0; i < imu->nr_bvecs; i++) {
		struct folio *folio = page_folio(imu->bvec[i].bv_page);

		unpin_user_folio(folio, 1);
	}
}

static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
					   int nr_bvecs)
{
	if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
		return io_cache_alloc(&ctx->imu_cache, GFP_KERNEL);
	return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs),
			GFP_KERNEL);
}

static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
	if (imu->nr_bvecs <= IO_CACHED_BVECS_SEGS)
		io_cache_free(&ctx->imu_cache, imu);
	else
		kvfree(imu);
}

static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
	if (unlikely(refcount_read(&imu->refs) > 1)) {
		if (!refcount_dec_and_test(&imu->refs))
			return;
	}

	if (imu->acct_pages)
		io_unaccount_mem(ctx->user, ctx->mm_account, imu->acct_pages);
	imu->release(imu->priv);
	io_free_imu(ctx, imu);
}

struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
{
	struct io_rsrc_node *node;

	node = io_cache_alloc(&ctx->node_cache, GFP_KERNEL);
	if (node) {
		node->type = type;
		node->refs = 1;
		node->tag = 0;
		node->file_ptr = 0;
	}
	return node;
}

bool io_rsrc_cache_init(struct io_ring_ctx *ctx)
{
	const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec,
						 IO_CACHED_BVECS_SEGS);
	const int node_size = sizeof(struct io_rsrc_node);
	bool ret;

	ret = io_alloc_cache_init(&ctx->node_cache, IO_ALLOC_CACHE_MAX,
				  node_size, 0);
	ret |= io_alloc_cache_init(&ctx->imu_cache, IO_ALLOC_CACHE_MAX,
				   imu_cache_size, 0);
	return ret;
}

void io_rsrc_cache_free(struct io_ring_ctx *ctx)
{
	io_alloc_cache_free(&ctx->node_cache, kfree);
	io_alloc_cache_free(&ctx->imu_cache, kfree);
}

static void io_clear_table_tags(struct io_rsrc_data *data)
{
	int i;

	for (i = 0; i < data->nr; i++) {
		struct io_rsrc_node *node = data->nodes[i];

		if (node)
			node->tag = 0;
	}
}

__cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
			      struct io_rsrc_data *data)
{
	if (!data->nr)
		return;
	while (data->nr--) {
		if (data->nodes[data->nr])
			io_put_rsrc_node(ctx, data->nodes[data->nr]);
	}
	kvfree(data->nodes);
	data->nodes = NULL;
	data->nr = 0;
}

__cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
{
	data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *),
				     GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (data->nodes) {
		data->nr = nr;
		return 0;
	}
	return -ENOMEM;
}

static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	int fd, i, err = 0;
	unsigned int done;

	if (!ctx->file_table.data.nr)
		return -ENXIO;
	if (up->offset + nr_args > ctx->file_table.data.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = up->offset + done;
		if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i))
			io_file_bitmap_clear(&ctx->file_table, i);

		if (fd != -1) {
			struct file *file = fget(fd);
			struct io_rsrc_node *node;

			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
			if (!node) {
				err = -ENOMEM;
				fput(file);
				break;
			}
			ctx->file_table.data.nodes[i] = node;
			if (tag)
				node->tag = tag;
			io_fixed_file_set(node, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}
	return done ? done : err;
}

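/*
 * Example (userspace sketch, not compiled here; the slot numbers and fds
 * are made up): updating two fixed-file slots through
 * IORING_REGISTER_FILES_UPDATE2.  An fd of -1 clears a slot,
 * IORING_REGISTER_FILES_SKIP leaves it untouched, and the call returns the
 * number of entries processed, or a negative error if none were.
 *
 *	int fds[2] = { new_fd, -1 };	// replace slot 5, clear slot 6
 *	struct io_uring_rsrc_update2 up = {
 *		.offset	= 5,
 *		.data	= (unsigned long)fds,
 *		.nr	= 2,
 *	};
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_FILES_UPDATE2,
 *		&up, sizeof(up));
 */
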
static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec fast_iov, *iov;
	struct page *last_hpage = NULL;
	struct iovec __user *uvec;
	u64 user_data = up->data;
	__u32 done;
	int i, err;

	if (!ctx->buf_table.nr)
		return -ENXIO;
	if (up->offset + nr_args > ctx->buf_table.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		uvec = u64_to_user_ptr(user_data);
		iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
		if (IS_ERR(iov)) {
			err = PTR_ERR(iov);
			break;
		}
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(iov);
		if (err)
			break;
		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			err = PTR_ERR(node);
			break;
		}
		if (tag) {
			if (!node) {
				err = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
		io_reset_rsrc_node(ctx, &ctx->buf_table, i);
		ctx->buf_table.nodes[i] = node;
		if (ctx->compat)
			user_data += sizeof(struct compat_iovec);
		else
			user_data += sizeof(struct iovec);
	}
	return done ? done : err;
}

static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args)
{
	__u32 tmp;

	lockdep_assert_held(&ctx->uring_lock);

	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extensible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}

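/*
 * Example (userspace sketch, not compiled here; the ring fd and table size
 * are made up): registering a sparse table of 64 empty buffer slots via
 * IORING_REGISTER_BUFFERS2, to be filled in later with
 * IORING_REGISTER_BUFFERS_UPDATE or io_buffer_register_bvec():
 *
 *	struct io_uring_rsrc_register rr = {
 *		.nr	= 64,
 *		.flags	= IORING_RSRC_REGISTER_SPARSE,
 *	};
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_BUFFERS2,
 *		&rr, sizeof(rr));
 */
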
int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	return 0;
}

static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_table.data.nr)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (get_user(fd, &fds[done])) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (put_user(ret, &fds[done])) {
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}

int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;
	int ret;

	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
						&up2, up->nr_args);
		io_ring_submit_unlock(ctx, issue_flags);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_COMPLETE;
}

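/*
 * Example (userspace sketch with liburing, not compiled here): the same
 * update can be driven from the submission path with IORING_OP_FILES_UPDATE.
 * Passing IORING_FILE_INDEX_ALLOC as the offset asks the kernel to pick free
 * slots itself and write the allocated indices back into the fds array; the
 * CQE result is the number of files installed.
 *
 *	int fds[1] = { some_fd };
 *	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
 *
 *	io_uring_prep_files_update(sqe, fds, 1, IORING_FILE_INDEX_ALLOC);
 *	io_uring_submit(&ring);
 *	// on success, fds[0] now holds the allocated fixed-file index
 */
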
void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
	if (node->tag)
		io_post_aux_cqe(ctx, node->tag, 0, 0);

	switch (node->type) {
	case IORING_RSRC_FILE:
		fput(io_slot_file(node));
		break;
	case IORING_RSRC_BUFFER:
		io_buffer_unmap(ctx, node->buf);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}

	io_cache_free(&ctx->node_cache, node);
}

int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	if (!ctx->file_table.data.nr)
		return -ENXIO;

	io_free_file_tables(ctx, &ctx->file_table);
	io_file_table_set_alloc_range(ctx, 0, 0);
	return 0;
}

int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_table.data.nr)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args))
		return -ENOMEM;

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		ret = -EFAULT;
		if (tags && copy_from_user(&tag, &tags[i], sizeof(tag)))
			goto fail;
		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd)))
			goto fail;
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (tag)
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		ret = -ENOMEM;
		node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
		if (!node) {
			fput(file);
			goto fail;
		}
		if (tag)
			node->tag = tag;
		ctx->file_table.data.nodes[i] = node;
		io_fixed_file_set(node, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
	return 0;
fail:
	io_clear_table_tags(&ctx->file_table.data);
	io_sqe_files_unregister(ctx);
	return ret;
}

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	if (!ctx->buf_table.nr)
		return -ENXIO;
	io_rsrc_data_free(ctx, &ctx->buf_table);
	return 0;
}

/*
 * Not super efficient, but this only runs at registration time. And we do
 * cache the last compound head, so generally we'll only do a full search
 * if we don't match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of
 * the page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->buf_table.nr; i++) {
		struct io_rsrc_node *node = ctx->buf_table.nodes[i];
		struct io_mapped_ubuf *imu;

		if (!node)
			continue;
		imu = node->buf;
		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}

static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx->user, ctx->mm_account, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}

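/*
 * Worked example (illustrative only): two registered buffers that both sit
 * inside the same 2 MiB huge page.  The first registration accounts the
 * whole compound page, 512 pages with a 4 KiB PAGE_SIZE.  When the second
 * buffer is registered, headpage_already_acct() finds the same compound
 * head in the existing table, so io_buffer_account_pin() adds nothing and
 * the huge page is charged against RLIMIT_MEMLOCK exactly once.
 */
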
static bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
			       struct io_imu_folio_data *data)
{
	struct page **page_array = *pages, **new_array = NULL;
	unsigned nr_pages_left = *nr_pages;
	unsigned nr_folios = data->nr_folios;
	unsigned i, j;

	/* Store head pages only */
	new_array = kvmalloc_array(nr_folios, sizeof(struct page *), GFP_KERNEL);
	if (!new_array)
		return false;

	for (i = 0, j = 0; i < nr_folios; i++) {
		struct page *p = compound_head(page_array[j]);
		struct folio *folio = page_folio(p);
		unsigned int nr;

		WARN_ON_ONCE(i > 0 && p != page_array[j]);

		nr = i ? data->nr_pages_mid : data->nr_pages_head;
		nr = min(nr, nr_pages_left);
		/* Drop all but one ref, the entire folio will remain pinned. */
		if (nr > 1)
			unpin_user_folio(folio, nr - 1);
		j += nr;
		nr_pages_left -= nr;
		new_array[i] = p;
	}

	WARN_ON_ONCE(j != *nr_pages);

	kvfree(page_array);
	*pages = new_array;
	*nr_pages = nr_folios;
	return true;
}

bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
			      struct io_imu_folio_data *data)
{
	struct folio *folio = page_folio(page_array[0]);
	unsigned int count = 1, nr_folios = 1;
	int i;

	data->nr_pages_mid = folio_nr_pages(folio);
	data->folio_shift = folio_shift(folio);
	data->first_folio_page_idx = folio_page_idx(folio, page_array[0]);

	/*
	 * Check if pages are contiguous inside a folio, and all folios have
	 * the same page count except for the head and tail.
	 */
	for (i = 1; i < nr_pages; i++) {
		if (page_folio(page_array[i]) == folio &&
		    page_array[i] == page_array[i-1] + 1) {
			count++;
			continue;
		}

		if (nr_folios == 1) {
			if (folio_page_idx(folio, page_array[i-1]) !=
			    data->nr_pages_mid - 1)
				return false;

			data->nr_pages_head = count;
		} else if (count != data->nr_pages_mid) {
			return false;
		}

		folio = page_folio(page_array[i]);
		if (folio_size(folio) != (1UL << data->folio_shift) ||
		    folio_page_idx(folio, page_array[i]) != 0)
			return false;

		count = 1;
		nr_folios++;
	}
	if (nr_folios == 1)
		data->nr_pages_head = count;

	data->nr_folios = nr_folios;
	return true;
}

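/*
 * Worked example (illustrative only): a 4 MiB buffer backed by two 2 MiB
 * THP folios pins 1024 PAGE_SIZE pages.  io_check_coalesce_buffer() sees
 * two folios of 512 pages each, so io_coalesce_buffer() shrinks the page
 * array to the two head pages and io_sqe_buffer_register() below builds an
 * imu with nr_bvecs == 2 and folio_shift == 21, instead of 1024 separate
 * 4 KiB bvec entries.
 */
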
static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
						   struct iovec *iov,
						   struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	struct io_rsrc_node *node;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;
	struct io_imu_folio_data data;
	bool coalesced = false;

	if (!iov->iov_base)
		return NULL;

	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
	if (!node)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
				&nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	/* If it's huge page(s), try to coalesce them into fewer bvec entries */
	if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) {
		if (data.nr_pages_mid != 1)
			coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
	}

	imu = io_alloc_imu(ctx, nr_pages);
	if (!imu)
		goto done;

	imu->nr_bvecs = nr_pages;
	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret)
		goto done;

	size = iov->iov_len;
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->len = iov->iov_len;
	imu->folio_shift = PAGE_SHIFT;
	imu->release = io_release_ubuf;
	imu->priv = imu;
	imu->is_kbuf = false;
	imu->dir = IO_IMU_DEST | IO_IMU_SOURCE;
	if (coalesced)
		imu->folio_shift = data.folio_shift;
	refcount_set(&imu->refs, 1);

	off = (unsigned long)iov->iov_base & ~PAGE_MASK;
	if (coalesced)
		off += data.first_folio_page_idx << PAGE_SHIFT;

	node->buf = imu;
	ret = 0;

	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
		off = 0;
		size -= vec_len;
	}
done:
	if (ret) {
		if (imu)
			io_free_imu(ctx, imu);
		if (pages) {
			for (i = 0; i < nr_pages; i++)
				unpin_user_folio(page_folio(pages[i]), 1);
		}
		io_cache_free(&ctx->node_cache, node);
		node = ERR_PTR(ret);
	}
	kvfree(pages);
	return node;
}

int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data data;
	struct iovec fast_iov, *iov = &fast_iov;
	const struct iovec __user *uvec;
	int i, ret;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->buf_table.nr)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_data_alloc(&data, nr_args);
	if (ret)
		return ret;

	if (!arg)
		memset(iov, 0, sizeof(*iov));

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		if (arg) {
			uvec = (struct iovec __user *) arg;
			iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
			if (IS_ERR(iov)) {
				ret = PTR_ERR(iov);
				break;
			}
			ret = io_buffer_validate(iov);
			if (ret)
				break;
			if (ctx->compat)
				arg += sizeof(struct compat_iovec);
			else
				arg += sizeof(struct iovec);
		}

		if (tags) {
			if (copy_from_user(&tag, &tags[i], sizeof(tag))) {
				ret = -EFAULT;
				break;
			}
		}

		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			ret = PTR_ERR(node);
			break;
		}
		if (tag) {
			if (!node) {
				ret = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		data.nodes[i] = node;
	}

	ctx->buf_table = data;
	if (ret) {
		io_clear_table_tags(&ctx->buf_table);
		io_sqe_buffers_unregister(ctx);
	}
	return ret;
}

int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
			    void (*release)(void *), unsigned int index,
			    unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *data = &ctx->buf_table;
	struct req_iterator rq_iter;
	struct io_mapped_ubuf *imu;
	struct io_rsrc_node *node;
	struct bio_vec bv;
	unsigned int nr_bvecs = 0;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	if (index >= data->nr) {
		ret = -EINVAL;
		goto unlock;
	}
	index = array_index_nospec(index, data->nr);

	if (data->nodes[index]) {
		ret = -EBUSY;
		goto unlock;
	}

	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
	if (!node) {
		ret = -ENOMEM;
		goto unlock;
	}

	/*
	 * blk_rq_nr_phys_segments() may overestimate the number of bvecs
	 * but avoids needing to iterate over the bvecs
	 */
	imu = io_alloc_imu(ctx, blk_rq_nr_phys_segments(rq));
	if (!imu) {
		kfree(node);
		ret = -ENOMEM;
		goto unlock;
	}

	imu->ubuf = 0;
	imu->len = blk_rq_bytes(rq);
	imu->acct_pages = 0;
	imu->folio_shift = PAGE_SHIFT;
	refcount_set(&imu->refs, 1);
	imu->release = release;
	imu->priv = rq;
	imu->is_kbuf = true;
	imu->dir = 1 << rq_data_dir(rq);

	rq_for_each_bvec(bv, rq, rq_iter)
		imu->bvec[nr_bvecs++] = bv;
	imu->nr_bvecs = nr_bvecs;

	node->buf = imu;
	data->nodes[index] = node;
unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_register_bvec);

int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
			      unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *data = &ctx->buf_table;
	struct io_rsrc_node *node;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	if (index >= data->nr) {
		ret = -EINVAL;
		goto unlock;
	}
	index = array_index_nospec(index, data->nr);

	node = data->nodes[index];
	if (!node) {
		ret = -EINVAL;
		goto unlock;
	}
	if (!node->buf->is_kbuf) {
		ret = -EBUSY;
		goto unlock;
	}

	io_put_rsrc_node(ctx, node);
	data->nodes[index] = NULL;
unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec);

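/*
 * Example (driver-side sketch, not compiled here; the callback name is made
 * up): a driver handling a uring_cmd can expose the pages of an in-flight
 * struct request as a fixed buffer at the given table index, so the
 * application can target it with fixed-buffer reads/writes, and drop it
 * again once the request is done.  The release callback runs when io_uring
 * drops its last reference to the buffer.
 *
 *	static void my_release(void *priv)
 *	{
 *		struct request *rq = priv;
 *
 *		// complete/free the request now that io_uring is done with it
 *	}
 *
 *	ret = io_buffer_register_bvec(cmd, rq, my_release, index, issue_flags);
 *	...
 *	io_buffer_unregister_bvec(cmd, index, issue_flags);
 */
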
static int validate_fixed_range(u64 buf_addr, size_t len,
				const struct io_mapped_ubuf *imu)
{
	u64 buf_end;

	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
		return -EFAULT;
	if (unlikely(len > MAX_RW_COUNT))
		return -EFAULT;
	return 0;
}

static int io_import_kbuf(int ddir, struct iov_iter *iter,
			  struct io_mapped_ubuf *imu, size_t len, size_t offset)
{
	size_t count = len + offset;

	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count);
	iov_iter_advance(iter, offset);

	if (count < imu->len) {
		const struct bio_vec *bvec = iter->bvec;

		len += iter->iov_offset;
		while (len > bvec->bv_len) {
			len -= bvec->bv_len;
			bvec++;
		}
		iter->nr_segs = 1 + bvec - iter->bvec;
	}
	return 0;
}

static int io_import_fixed(int ddir, struct iov_iter *iter,
			   struct io_mapped_ubuf *imu,
			   u64 buf_addr, size_t len)
{
	const struct bio_vec *bvec;
	size_t folio_mask;
	unsigned nr_segs;
	size_t offset;
	int ret;

	ret = validate_fixed_range(buf_addr, len, imu);
	if (unlikely(ret))
		return ret;
	if (!(imu->dir & (1 << ddir)))
		return -EFAULT;

	offset = buf_addr - imu->ubuf;

	if (imu->is_kbuf)
		return io_import_kbuf(ddir, iter, imu, len, offset);

	/*
	 * Don't use iov_iter_advance() here, as it's really slow for
	 * using the latter parts of a big fixed buffer - it iterates
	 * over each segment manually. We can cheat a bit here for user
	 * registered nodes, because we know that:
	 *
	 * 1) it's a BVEC iter, we set it up
	 * 2) all bvecs are the same size, except potentially the
	 *    first and last bvec
	 */
	folio_mask = (1UL << imu->folio_shift) - 1;
	bvec = imu->bvec;
	if (offset >= bvec->bv_len) {
		unsigned long seg_skip;

		/* skip first vec */
		offset -= bvec->bv_len;
		seg_skip = 1 + (offset >> imu->folio_shift);
		bvec += seg_skip;
		offset &= folio_mask;
	}
	nr_segs = (offset + len + bvec->bv_offset + folio_mask) >> imu->folio_shift;
	iov_iter_bvec(iter, ddir, bvec, nr_segs, len);
	iter->iov_offset = offset;
	return 0;
}

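/*
 * Worked example (illustrative only, addresses are made up): a buffer
 * registered from a single 2 MiB folio (folio_shift == 21, one bvec) with
 * ubuf == 0x40000000.  Importing buf_addr == 0x40100000, len == 8192 gives
 * offset == 1 MiB; that is smaller than bvec->bv_len, so no segments are
 * skipped, nr_segs == 1 and the iterator simply starts 1 MiB into the
 * folio.  For a buffer built from 4 KiB pages, the same request instead
 * skips 256 bvec entries with one shift rather than walking them one by
 * one.
 */
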
inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req,
					     unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_rsrc_node *node;

	if (req->flags & REQ_F_BUF_NODE)
		return req->buf_node;
	req->flags |= REQ_F_BUF_NODE;

	io_ring_submit_lock(ctx, issue_flags);
	node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
	if (node) {
		node->refs++;
		req->buf_node = node;
		io_ring_submit_unlock(ctx, issue_flags);
		return node;
	}
	req->flags &= ~REQ_F_BUF_NODE;
	io_ring_submit_unlock(ctx, issue_flags);
	return NULL;
}

int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,
		      u64 buf_addr, size_t len, int ddir,
		      unsigned issue_flags)
{
	struct io_rsrc_node *node;

	node = io_find_buf_node(req, issue_flags);
	if (!node)
		return -EFAULT;
	return io_import_fixed(ddir, iter, node->buf, buf_addr, len);
}

/* Lock two rings at once. The rings must be different! */
static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
{
	if (ctx1 > ctx2)
		swap(ctx1, ctx2);
	mutex_lock(&ctx1->uring_lock);
	mutex_lock_nested(&ctx2->uring_lock, SINGLE_DEPTH_NESTING);
}

/* Both rings are locked by the caller. */
static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
			    struct io_uring_clone_buffers *arg)
{
	struct io_rsrc_data data;
	int i, ret, off, nr;
	unsigned int nbufs;

	lockdep_assert_held(&ctx->uring_lock);
	lockdep_assert_held(&src_ctx->uring_lock);

	/*
	 * Accounting state is shared between the two rings; that only works if
	 * both rings are accounted towards the same counters.
	 */
	if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account)
		return -EINVAL;

	/* if offsets are given, must have nr specified too */
	if (!arg->nr && (arg->dst_off || arg->src_off))
		return -EINVAL;
	/* not allowed unless REPLACE is set */
	if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
		return -EBUSY;

	nbufs = src_ctx->buf_table.nr;
	if (!nbufs)
		return -ENXIO;
	if (!arg->nr)
		arg->nr = nbufs;
	else if (arg->nr > nbufs)
		return -EINVAL;
	else if (arg->nr > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	if (check_add_overflow(arg->nr, arg->src_off, &off) || off > nbufs)
		return -EOVERFLOW;
	if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
		return -EOVERFLOW;
	if (nbufs > IORING_MAX_REG_BUFFERS)
		return -EINVAL;

	ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr));
	if (ret)
		return ret;

	/* Copy original dst nodes from before the cloned range */
	for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
		struct io_rsrc_node *node = ctx->buf_table.nodes[i];

		if (node) {
			data.nodes[i] = node;
			node->refs++;
		}
	}

	off = arg->dst_off;
	i = arg->src_off;
	nr = arg->nr;
	while (nr--) {
		struct io_rsrc_node *dst_node, *src_node;

		src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i);
		if (!src_node) {
			dst_node = NULL;
		} else {
			dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
			if (!dst_node) {
				io_rsrc_data_free(ctx, &data);
				return -ENOMEM;
			}

			refcount_inc(&src_node->buf->refs);
			dst_node->buf = src_node->buf;
		}
		data.nodes[off++] = dst_node;
		i++;
	}

	/* Copy original dst nodes from after the cloned range */
	for (i = nbufs; i < ctx->buf_table.nr; i++) {
		struct io_rsrc_node *node = ctx->buf_table.nodes[i];

		if (node) {
			data.nodes[i] = node;
			node->refs++;
		}
	}

	/*
	 * If asked for replace, put the old table. data->nodes[] holds both
	 * old and new nodes at this point.
	 */
	if (arg->flags & IORING_REGISTER_DST_REPLACE)
		io_rsrc_data_free(ctx, &ctx->buf_table);

	/*
	 * ctx->buf_table must be empty now - either the contents are being
	 * replaced and we just freed the table, or the contents are being
	 * copied to a ring that does not have buffers yet (checked at function
	 * entry).
	 */
	WARN_ON_ONCE(ctx->buf_table.nr);
	ctx->buf_table = data;
	return 0;
}

/*
 * Copy the registered buffers from the source ring whose file descriptor
 * is given in the src_fd to the current ring. This is identical to
 * registering the buffers with ctx, except faster as mappings already exist.
 *
 * Since the memory is already accounted once, don't account it again.
 */
int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_clone_buffers buf;
	struct io_ring_ctx *src_ctx;
	bool registered_src;
	struct file *file;
	int ret;

	if (copy_from_user(&buf, arg, sizeof(buf)))
		return -EFAULT;
	if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE))
		return -EINVAL;
	if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr)
		return -EBUSY;
	if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
		return -EINVAL;

	registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
	file = io_uring_register_get_file(buf.src_fd, registered_src);
	if (IS_ERR(file))
		return PTR_ERR(file);

	src_ctx = file->private_data;
	if (src_ctx != ctx) {
		mutex_unlock(&ctx->uring_lock);
		lock_two_rings(ctx, src_ctx);

		if (src_ctx->submitter_task &&
		    src_ctx->submitter_task != current) {
			ret = -EEXIST;
			goto out;
		}
	}

	ret = io_clone_buffers(ctx, src_ctx, &buf);

out:
	if (src_ctx != ctx)
		mutex_unlock(&src_ctx->uring_lock);

	fput(file);
	return ret;
}

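/*
 * Example (userspace sketch, not compiled here; the fds are made up):
 * cloning all registered buffers from one ring into another, so both rings
 * can issue fixed-buffer I/O against the same pinned memory without
 * registering (and accounting) it twice:
 *
 *	struct io_uring_clone_buffers c = {
 *		.src_fd	= src_ring_fd,
 *	};
 *	syscall(__NR_io_uring_register, dst_ring_fd,
 *		IORING_REGISTER_CLONE_BUFFERS, &c, 1);
 */
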
void io_vec_free(struct iou_vec *iv)
{
	if (!iv->iovec)
		return;
	kfree(iv->iovec);
	iv->iovec = NULL;
	iv->nr = 0;
}

int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries)
{
	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
	struct iovec *iov;

	iov = kmalloc_array(nr_entries, sizeof(iov[0]), gfp);
	if (!iov)
		return -ENOMEM;

	io_vec_free(iv);
	iv->iovec = iov;
	iv->nr = nr_entries;
	return 0;
}

static int io_vec_fill_bvec(int ddir, struct iov_iter *iter,
			    struct io_mapped_ubuf *imu,
			    struct iovec *iovec, unsigned nr_iovs,
			    struct iou_vec *vec)
{
	unsigned long folio_size = 1 << imu->folio_shift;
	unsigned long folio_mask = folio_size - 1;
	struct bio_vec *res_bvec = vec->bvec;
	size_t total_len = 0;
	unsigned bvec_idx = 0;
	unsigned iov_idx;

	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
		size_t iov_len = iovec[iov_idx].iov_len;
		u64 buf_addr = (u64)(uintptr_t)iovec[iov_idx].iov_base;
		struct bio_vec *src_bvec;
		size_t offset;
		int ret;

		ret = validate_fixed_range(buf_addr, iov_len, imu);
		if (unlikely(ret))
			return ret;

		if (unlikely(!iov_len))
			return -EFAULT;
		if (unlikely(check_add_overflow(total_len, iov_len, &total_len)))
			return -EOVERFLOW;

		offset = buf_addr - imu->ubuf;
		/*
		 * Only the first bvec can have a non-zero bv_offset, account it
		 * here and work with full folios below.
		 */
		offset += imu->bvec[0].bv_offset;

		src_bvec = imu->bvec + (offset >> imu->folio_shift);
		offset &= folio_mask;

		for (; iov_len; offset = 0, bvec_idx++, src_bvec++) {
			size_t seg_size = min_t(size_t, iov_len,
						folio_size - offset);

			bvec_set_page(&res_bvec[bvec_idx],
				      src_bvec->bv_page, seg_size, offset);
			iov_len -= seg_size;
		}
	}
	if (total_len > MAX_RW_COUNT)
		return -EINVAL;

	iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len);
	return 0;
}

static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs,
				 struct io_mapped_ubuf *imu)
{
	unsigned shift = imu->folio_shift;
	size_t max_segs = 0;
	unsigned i;

	for (i = 0; i < nr_iovs; i++) {
		max_segs += (iov[i].iov_len >> shift) + 2;
		if (max_segs > INT_MAX)
			return -EOVERFLOW;
	}
	return max_segs;
}

static int io_vec_fill_kern_bvec(int ddir, struct iov_iter *iter,
				 struct io_mapped_ubuf *imu,
				 struct iovec *iovec, unsigned nr_iovs,
				 struct iou_vec *vec)
{
	const struct bio_vec *src_bvec = imu->bvec;
	struct bio_vec *res_bvec = vec->bvec;
	unsigned res_idx = 0;
	size_t total_len = 0;
	unsigned iov_idx;

	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
		size_t offset = (size_t)(uintptr_t)iovec[iov_idx].iov_base;
		size_t iov_len = iovec[iov_idx].iov_len;
		struct bvec_iter bi = {
			.bi_size	= offset + iov_len,
		};
		struct bio_vec bv;

		bvec_iter_advance(src_bvec, &bi, offset);
		for_each_mp_bvec(bv, src_bvec, bi, bi)
			res_bvec[res_idx++] = bv;
		total_len += iov_len;
	}
	iov_iter_bvec(iter, ddir, res_bvec, res_idx, total_len);
	return 0;
}

static int iov_kern_bvec_size(const struct iovec *iov,
			      const struct io_mapped_ubuf *imu,
			      unsigned int *nr_seg)
{
	size_t offset = (size_t)(uintptr_t)iov->iov_base;
	const struct bio_vec *bvec = imu->bvec;
	int start = 0, i = 0;
	size_t off = 0;
	int ret;

	ret = validate_fixed_range(offset, iov->iov_len, imu);
	if (unlikely(ret))
		return ret;

	for (i = 0; off < offset + iov->iov_len && i < imu->nr_bvecs;
			off += bvec[i].bv_len, i++) {
		if (offset >= off && offset < off + bvec[i].bv_len)
			start = i;
	}
	*nr_seg = i - start;
	return 0;
}

static int io_kern_bvec_size(struct iovec *iov, unsigned nr_iovs,
			     struct io_mapped_ubuf *imu, unsigned *nr_segs)
{
	unsigned max_segs = 0;
	size_t total_len = 0;
	unsigned i;
	int ret;

	*nr_segs = 0;
	for (i = 0; i < nr_iovs; i++) {
		if (unlikely(!iov[i].iov_len))
			return -EFAULT;
		if (unlikely(check_add_overflow(total_len, iov[i].iov_len,
						&total_len)))
			return -EOVERFLOW;
		ret = iov_kern_bvec_size(&iov[i], imu, &max_segs);
		if (unlikely(ret))
			return ret;
		*nr_segs += max_segs;
	}
	if (total_len > MAX_RW_COUNT)
		return -EINVAL;
	return 0;
}

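/*
 * Worked example (illustrative only): for a user buffer registered from
 * 4 KiB pages (folio_shift == 12), io_estimate_bvec_size() budgets
 * (iov_len >> 12) + 2 segments per iovec, so a 10 KiB iovec is allowed up
 * to 4 bio_vecs: two full pages plus a partial page at either end,
 * depending on alignment.  The estimate intentionally errs high;
 * io_vec_fill_bvec() then emits only the segments actually needed.
 */
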
int io_import_reg_vec(int ddir, struct iov_iter *iter,
		      struct io_kiocb *req, struct iou_vec *vec,
		      unsigned nr_iovs, unsigned issue_flags)
{
	struct io_rsrc_node *node;
	struct io_mapped_ubuf *imu;
	unsigned iovec_off;
	struct iovec *iov;
	unsigned nr_segs;

	node = io_find_buf_node(req, issue_flags);
	if (!node)
		return -EFAULT;
	imu = node->buf;
	if (!(imu->dir & (1 << ddir)))
		return -EFAULT;

	iovec_off = vec->nr - nr_iovs;
	iov = vec->iovec + iovec_off;

	if (imu->is_kbuf) {
		int ret = io_kern_bvec_size(iov, nr_iovs, imu, &nr_segs);

		if (unlikely(ret))
			return ret;
	} else {
		int ret = io_estimate_bvec_size(iov, nr_iovs, imu);

		if (ret < 0)
			return ret;
		nr_segs = ret;
	}

	if (sizeof(struct bio_vec) > sizeof(struct iovec)) {
		size_t bvec_bytes;

		bvec_bytes = nr_segs * sizeof(struct bio_vec);
		nr_segs = (bvec_bytes + sizeof(*iov) - 1) / sizeof(*iov);
		nr_segs += nr_iovs;
	}

	if (nr_segs > vec->nr) {
		struct iou_vec tmp_vec = {};
		int ret;

		ret = io_vec_realloc(&tmp_vec, nr_segs);
		if (ret)
			return ret;

		iovec_off = tmp_vec.nr - nr_iovs;
		memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs);
		io_vec_free(vec);

		*vec = tmp_vec;
		iov = vec->iovec + iovec_off;
		req->flags |= REQ_F_NEED_CLEANUP;
	}

	if (imu->is_kbuf)
		return io_vec_fill_kern_bvec(ddir, iter, imu, iov, nr_iovs, vec);

	return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec);
}

int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv,
		      const struct iovec __user *uvec, size_t uvec_segs)
{
	struct iovec *iov;
	int iovec_off, ret;
	void *res;

	if (uvec_segs > iv->nr) {
		ret = io_vec_realloc(iv, uvec_segs);
		if (ret)
			return ret;
		req->flags |= REQ_F_NEED_CLEANUP;
	}

	/* pad iovec to the right */
	iovec_off = iv->nr - uvec_segs;
	iov = iv->iovec + iovec_off;
	res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov,
			      io_is_compat(req->ctx));
	if (IS_ERR(res))
		return PTR_ERR(res);

	req->flags |= REQ_F_IMPORT_BUFFER;
	return 0;
}