// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring/cmd.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "openclose.h"
#include "rsrc.h"
#include "memmap.h"
#include "register.h"

struct io_rsrc_update {
	struct file *file;
	u64 arg;
	u32 nr_args;
	u32 offset;
};

static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
			struct iovec *iov, struct page **last_hpage);

/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)

#define IO_CACHED_BVECS_SEGS	32

int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
	return 0;
}

static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	if (ctx->user)
		__io_unaccount_mem(ctx->user, nr_pages);

	if (ctx->mm_account)
		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}

static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	int ret;

	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
	}

	if (ctx->mm_account)
		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);

	return 0;
}

int io_validate_user_buf_range(u64 uaddr, u64 ulen)
{
	unsigned long tmp, base = (unsigned long)uaddr;
	unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen);

	/* arbitrary limit, but we need something */
	if (ulen > SZ_1G || !ulen)
		return -EFAULT;
	if (check_add_overflow(base, acct_len, &tmp))
		return -EOVERFLOW;
	return 0;
}

static int io_buffer_validate(struct iovec *iov)
{
	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
	if (!iov->iov_base)
		return iov->iov_len ? -EFAULT : 0;

	return io_validate_user_buf_range((unsigned long)iov->iov_base,
					  iov->iov_len);
}

static void io_release_ubuf(void *priv)
{
	struct io_mapped_ubuf *imu = priv;
	unsigned int i;

	for (i = 0; i < imu->nr_bvecs; i++)
		unpin_user_page(imu->bvec[i].bv_page);
}
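
/*
 * Small mappings (up to IO_CACHED_BVECS_SEGS bvecs) are served from the
 * per-ring imu cache; anything larger falls back to kvmalloc(). io_free_imu()
 * mirrors this split when the mapping is torn down.
 */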
static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
					   int nr_bvecs)
{
	if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
		return io_cache_alloc(&ctx->imu_cache, GFP_KERNEL);
	return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs),
			GFP_KERNEL);
}

static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
	if (imu->nr_bvecs <= IO_CACHED_BVECS_SEGS)
		io_cache_free(&ctx->imu_cache, imu);
	else
		kvfree(imu);
}

static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
	if (!refcount_dec_and_test(&imu->refs))
		return;

	if (imu->acct_pages)
		io_unaccount_mem(ctx, imu->acct_pages);
	imu->release(imu->priv);
	io_free_imu(ctx, imu);
}

struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
{
	struct io_rsrc_node *node;

	node = io_cache_alloc(&ctx->node_cache, GFP_KERNEL);
	if (node) {
		node->type = type;
		node->refs = 1;
		node->tag = 0;
		node->file_ptr = 0;
	}
	return node;
}

bool io_rsrc_cache_init(struct io_ring_ctx *ctx)
{
	const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec,
						 IO_CACHED_BVECS_SEGS);
	const int node_size = sizeof(struct io_rsrc_node);
	bool ret;

	ret = io_alloc_cache_init(&ctx->node_cache, IO_ALLOC_CACHE_MAX,
				  node_size, 0);
	ret |= io_alloc_cache_init(&ctx->imu_cache, IO_ALLOC_CACHE_MAX,
				   imu_cache_size, 0);
	return ret;
}

void io_rsrc_cache_free(struct io_ring_ctx *ctx)
{
	io_alloc_cache_free(&ctx->node_cache, kfree);
	io_alloc_cache_free(&ctx->imu_cache, kfree);
}

static void io_clear_table_tags(struct io_rsrc_data *data)
{
	int i;

	for (i = 0; i < data->nr; i++) {
		struct io_rsrc_node *node = data->nodes[i];

		if (node)
			node->tag = 0;
	}
}

__cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
			      struct io_rsrc_data *data)
{
	if (!data->nr)
		return;
	while (data->nr--) {
		if (data->nodes[data->nr])
			io_put_rsrc_node(ctx, data->nodes[data->nr]);
	}
	kvfree(data->nodes);
	data->nodes = NULL;
	data->nr = 0;
}

__cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
{
	data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *),
				     GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (data->nodes) {
		data->nr = nr;
		return 0;
	}
	return -ENOMEM;
}
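
/*
 * Apply a batch of file table updates starting at up->offset:
 * IORING_REGISTER_FILES_SKIP leaves a slot untouched, fd == -1 clears it, and
 * any other fd replaces it. A non-zero tag is only accepted when an actual
 * file is being installed.
 */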
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	int fd, i, err = 0;
	unsigned int done;

	if (!ctx->file_table.data.nr)
		return -ENXIO;
	if (up->offset + nr_args > ctx->file_table.data.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = up->offset + done;
		if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i))
			io_file_bitmap_clear(&ctx->file_table, i);

		if (fd != -1) {
			struct file *file = fget(fd);
			struct io_rsrc_node *node;

			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
			if (!node) {
				err = -ENOMEM;
				fput(file);
				break;
			}
			ctx->file_table.data.nodes[i] = node;
			if (tag)
				node->tag = tag;
			io_fixed_file_set(node, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}
	return done ? done : err;
}
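
/*
 * Buffer updates work like file updates: each entry fully re-registers the
 * slot at up->offset + done, dropping the old node. The iovec stride in
 * userspace depends on whether the ring belongs to a compat task.
 */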
static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec fast_iov, *iov;
	struct page *last_hpage = NULL;
	struct iovec __user *uvec;
	u64 user_data = up->data;
	__u32 done;
	int i, err;

	if (!ctx->buf_table.nr)
		return -ENXIO;
	if (up->offset + nr_args > ctx->buf_table.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		uvec = u64_to_user_ptr(user_data);
		iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
		if (IS_ERR(iov)) {
			err = PTR_ERR(iov);
			break;
		}
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(iov);
		if (err)
			break;
		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			err = PTR_ERR(node);
			break;
		}
		if (tag) {
			if (!node) {
				err = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
		io_reset_rsrc_node(ctx, &ctx->buf_table, i);
		ctx->buf_table.nodes[i] = node;
		if (ctx->compat)
			user_data += sizeof(struct compat_iovec);
		else
			user_data += sizeof(struct iovec);
	}
	return done ? done : err;
}

static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args)
{
	__u32 tmp;

	lockdep_assert_held(&ctx->uring_lock);

	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}

int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	return 0;
}
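
/*
 * IORING_FILE_INDEX_ALLOC variant: the file table picks a free slot for each
 * fd and the chosen index is written back to userspace. If that copy-out
 * fails, the freshly installed slot is closed again before bailing out.
 */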
static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_table.data.nr)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}

int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;
	int ret;

	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
						&up2, up->nr_args);
		io_ring_submit_unlock(ctx, issue_flags);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_COMPLETE;
}
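
/*
 * Last reference to a node is gone: post the registration tag (if any) as an
 * auxiliary CQE, drop the underlying file or buffer, and recycle the node via
 * the per-ring cache.
 */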
void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
	if (node->tag)
		io_post_aux_cqe(ctx, node->tag, 0, 0);

	switch (node->type) {
	case IORING_RSRC_FILE:
		fput(io_slot_file(node));
		break;
	case IORING_RSRC_BUFFER:
		io_buffer_unmap(ctx, node->buf);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}

	io_cache_free(&ctx->node_cache, node);
}

int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	if (!ctx->file_table.data.nr)
		return -ENXIO;

	io_free_file_tables(ctx, &ctx->file_table);
	io_file_table_set_alloc_range(ctx, 0, 0);
	return 0;
}

int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_table.data.nr)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args))
		return -ENOMEM;

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		ret = -EFAULT;
		if (tags && copy_from_user(&tag, &tags[i], sizeof(tag)))
			goto fail;
		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd)))
			goto fail;
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (tag)
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		ret = -ENOMEM;
		node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
		if (!node) {
			fput(file);
			goto fail;
		}
		if (tag)
			node->tag = tag;
		ctx->file_table.data.nodes[i] = node;
		io_fixed_file_set(node, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
	return 0;
fail:
	io_clear_table_tags(&ctx->file_table.data);
	io_sqe_files_unregister(ctx);
	return ret;
}

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	if (!ctx->buf_table.nr)
		return -ENXIO;
	io_rsrc_data_free(ctx, &ctx->buf_table);
	return 0;
}

/*
 * Not super efficient, but this only runs at registration time. And we do
 * cache the last compound head, so generally we'll only do a full search if
 * we don't match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->buf_table.nr; i++) {
		struct io_rsrc_node *node = ctx->buf_table.nodes[i];
		struct io_mapped_ubuf *imu;

		if (!node)
			continue;
		imu = node->buf;
		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}

static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}
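
/*
 * Collapse the pinned page array so that each folio is represented by a
 * single head page. All but one pin per folio is dropped; the remaining pin
 * keeps the whole folio resident for the lifetime of the registration.
 */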
static bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
			       struct io_imu_folio_data *data)
{
	struct page **page_array = *pages, **new_array = NULL;
	unsigned nr_pages_left = *nr_pages;
	unsigned nr_folios = data->nr_folios;
	unsigned i, j;

	/* Store head pages only */
	new_array = kvmalloc_array(nr_folios, sizeof(struct page *), GFP_KERNEL);
	if (!new_array)
		return false;

	for (i = 0, j = 0; i < nr_folios; i++) {
		struct page *p = compound_head(page_array[j]);
		struct folio *folio = page_folio(p);
		unsigned int nr;

		WARN_ON_ONCE(i > 0 && p != page_array[j]);

		nr = i ? data->nr_pages_mid : data->nr_pages_head;
		nr = min(nr, nr_pages_left);
		/* Drop all but one ref, the entire folio will remain pinned. */
		if (nr > 1)
			unpin_user_folio(folio, nr - 1);
		j += nr;
		nr_pages_left -= nr;
		new_array[i] = p;
	}

	WARN_ON_ONCE(j != *nr_pages);

	kvfree(page_array);
	*pages = new_array;
	*nr_pages = nr_folios;
	return true;
}

bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
			      struct io_imu_folio_data *data)
{
	struct folio *folio = page_folio(page_array[0]);
	unsigned int count = 1, nr_folios = 1;
	int i;

	data->nr_pages_mid = folio_nr_pages(folio);
	data->folio_shift = folio_shift(folio);

	/*
	 * Check if pages are contiguous inside a folio, and all folios have
	 * the same page count except for the head and tail.
	 */
	for (i = 1; i < nr_pages; i++) {
		if (page_folio(page_array[i]) == folio &&
		    page_array[i] == page_array[i-1] + 1) {
			count++;
			continue;
		}

		if (nr_folios == 1) {
			if (folio_page_idx(folio, page_array[i-1]) !=
			    data->nr_pages_mid - 1)
				return false;

			data->nr_pages_head = count;
		} else if (count != data->nr_pages_mid) {
			return false;
		}

		folio = page_folio(page_array[i]);
		if (folio_size(folio) != (1UL << data->folio_shift) ||
		    folio_page_idx(folio, page_array[i]) != 0)
			return false;

		count = 1;
		nr_folios++;
	}
	if (nr_folios == 1)
		data->nr_pages_head = count;

	data->nr_folios = nr_folios;
	return true;
}

static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
						   struct iovec *iov,
						   struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	struct io_rsrc_node *node;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;
	struct io_imu_folio_data data;
	bool coalesced = false;

	if (!iov->iov_base)
		return NULL;

	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
	if (!node)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
			     &nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	/* If it's huge page(s), try to coalesce them into fewer bvec entries */
	if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) {
		if (data.nr_pages_mid != 1)
			coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
	}

	imu = io_alloc_imu(ctx, nr_pages);
	if (!imu)
		goto done;

	imu->nr_bvecs = nr_pages;
	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret)
		goto done;

	size = iov->iov_len;
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->len = iov->iov_len;
	imu->folio_shift = PAGE_SHIFT;
	imu->release = io_release_ubuf;
	imu->priv = imu;
	imu->is_kbuf = false;
	imu->dir = IO_IMU_DEST | IO_IMU_SOURCE;
	if (coalesced)
		imu->folio_shift = data.folio_shift;
	refcount_set(&imu->refs, 1);
	off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1);
	node->buf = imu;
	ret = 0;

	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
		off = 0;
		size -= vec_len;
	}
done:
	if (ret) {
		if (imu)
			io_free_imu(ctx, imu);
		if (pages)
			unpin_user_pages(pages, nr_pages);
		io_cache_free(&ctx->node_cache, node);
		node = ERR_PTR(ret);
	}
	kvfree(pages);
	return node;
}
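
/*
 * Top-level buffer registration: a NULL arg registers nr_args empty (sparse)
 * slots, otherwise each user iovec is validated, pinned and installed. On
 * failure the tags are cleared first so no stray CQEs are posted, then the
 * partially built table is torn down again.
 */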
int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data data;
	struct iovec fast_iov, *iov = &fast_iov;
	const struct iovec __user *uvec;
	int i, ret;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->buf_table.nr)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_data_alloc(&data, nr_args);
	if (ret)
		return ret;

	if (!arg)
		memset(iov, 0, sizeof(*iov));

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		if (arg) {
			uvec = (struct iovec __user *) arg;
			iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
			if (IS_ERR(iov)) {
				ret = PTR_ERR(iov);
				break;
			}
			ret = io_buffer_validate(iov);
			if (ret)
				break;
			if (ctx->compat)
				arg += sizeof(struct compat_iovec);
			else
				arg += sizeof(struct iovec);
		}

		if (tags) {
			if (copy_from_user(&tag, &tags[i], sizeof(tag))) {
				ret = -EFAULT;
				break;
			}
		}

		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			ret = PTR_ERR(node);
			break;
		}
		if (tag) {
			if (!node) {
				ret = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		data.nodes[i] = node;
	}

	ctx->buf_table = data;
	if (ret) {
		io_clear_table_tags(&ctx->buf_table);
		io_sqe_buffers_unregister(ctx);
	}
	return ret;
}
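
/*
 * io_buffer_register_bvec() lets a driver expose the bio_vecs of a struct
 * request as a fixed buffer in the ring's table, typically from a uring_cmd
 * handler. A rough sketch (hypothetical driver code, not part of this file):
 *
 *	static void my_release(void *priv)
 *	{
 *		struct request *rq = priv;
 *
 *		// complete or re-queue rq once io_uring drops the buffer
 *	}
 *
 *	// inside a ->uring_cmd() handler, with a live request 'rq':
 *	ret = io_buffer_register_bvec(cmd, rq, my_release, index, issue_flags);
 *
 * The release callback is invoked with the request as its argument once the
 * last reference to the buffer node is dropped, see io_buffer_unmap().
 */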
int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
			    void (*release)(void *), unsigned int index,
			    unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *data = &ctx->buf_table;
	struct req_iterator rq_iter;
	struct io_mapped_ubuf *imu;
	struct io_rsrc_node *node;
	struct bio_vec bv, *bvec;
	u16 nr_bvecs;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	if (index >= data->nr) {
		ret = -EINVAL;
		goto unlock;
	}
	index = array_index_nospec(index, data->nr);

	if (data->nodes[index]) {
		ret = -EBUSY;
		goto unlock;
	}

	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
	if (!node) {
		ret = -ENOMEM;
		goto unlock;
	}

	nr_bvecs = blk_rq_nr_phys_segments(rq);
	imu = io_alloc_imu(ctx, nr_bvecs);
	if (!imu) {
		kfree(node);
		ret = -ENOMEM;
		goto unlock;
	}

	imu->ubuf = 0;
	imu->len = blk_rq_bytes(rq);
	imu->acct_pages = 0;
	imu->folio_shift = PAGE_SHIFT;
	imu->nr_bvecs = nr_bvecs;
	refcount_set(&imu->refs, 1);
	imu->release = release;
	imu->priv = rq;
	imu->is_kbuf = true;
	imu->dir = 1 << rq_data_dir(rq);

	bvec = imu->bvec;
	rq_for_each_bvec(bv, rq, rq_iter)
		*bvec++ = bv;

	node->buf = imu;
	data->nodes[index] = node;
unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_register_bvec);

int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
			      unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *data = &ctx->buf_table;
	struct io_rsrc_node *node;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	if (index >= data->nr) {
		ret = -EINVAL;
		goto unlock;
	}
	index = array_index_nospec(index, data->nr);

	node = data->nodes[index];
	if (!node) {
		ret = -EINVAL;
		goto unlock;
	}
	if (!node->buf->is_kbuf) {
		ret = -EBUSY;
		goto unlock;
	}

	io_put_rsrc_node(ctx, node);
	data->nodes[index] = NULL;
unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec);

static int validate_fixed_range(u64 buf_addr, size_t len,
				const struct io_mapped_ubuf *imu)
{
	u64 buf_end;

	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
		return -EFAULT;
	if (unlikely(len > MAX_RW_COUNT))
		return -EFAULT;
	return 0;
}

static int io_import_kbuf(int ddir, struct iov_iter *iter,
			  struct io_mapped_ubuf *imu, size_t len, size_t offset)
{
	size_t count = len + offset;

	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count);
	iov_iter_advance(iter, offset);

	if (count < imu->len) {
		const struct bio_vec *bvec = iter->bvec;

		while (len > bvec->bv_len) {
			len -= bvec->bv_len;
			bvec++;
		}
		iter->nr_segs = 1 + bvec - iter->bvec;
	}
	return 0;
}

static int io_import_fixed(int ddir, struct iov_iter *iter,
			   struct io_mapped_ubuf *imu,
			   u64 buf_addr, size_t len)
{
	const struct bio_vec *bvec;
	size_t folio_mask;
	unsigned nr_segs;
	size_t offset;
	int ret;

	ret = validate_fixed_range(buf_addr, len, imu);
	if (unlikely(ret))
		return ret;
	if (!(imu->dir & (1 << ddir)))
		return -EFAULT;

	offset = buf_addr - imu->ubuf;

	if (imu->is_kbuf)
		return io_import_kbuf(ddir, iter, imu, len, offset);

	/*
	 * Don't use iov_iter_advance() here, as it's really slow for
	 * using the latter parts of a big fixed buffer - it iterates
	 * over each segment manually. We can cheat a bit here for user
	 * registered nodes, because we know that:
	 *
	 * 1) it's a BVEC iter, we set it up
	 * 2) all bvecs are the same in size, except potentially the
	 *    first and last bvec
	 */
	folio_mask = (1UL << imu->folio_shift) - 1;
	bvec = imu->bvec;
	if (offset >= bvec->bv_len) {
		unsigned long seg_skip;

		/* skip first vec */
		offset -= bvec->bv_len;
		seg_skip = 1 + (offset >> imu->folio_shift);
		bvec += seg_skip;
		offset &= folio_mask;
	}
	nr_segs = (offset + len + bvec->bv_offset + folio_mask) >> imu->folio_shift;
	iov_iter_bvec(iter, ddir, bvec, nr_segs, len);
	iter->iov_offset = offset;
	return 0;
}
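
/*
 * Resolve req->buf_index to its registered buffer node, take a reference and
 * cache it in the request (REQ_F_BUF_NODE) so later imports on the same
 * request can skip the table lookup.
 */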
inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req,
					     unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_rsrc_node *node;

	if (req->flags & REQ_F_BUF_NODE)
		return req->buf_node;
	req->flags |= REQ_F_BUF_NODE;

	io_ring_submit_lock(ctx, issue_flags);
	node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
	if (node) {
		node->refs++;
		req->buf_node = node;
		io_ring_submit_unlock(ctx, issue_flags);
		return node;
	}
	req->flags &= ~REQ_F_BUF_NODE;
	io_ring_submit_unlock(ctx, issue_flags);
	return NULL;
}

int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,
		      u64 buf_addr, size_t len, int ddir,
		      unsigned issue_flags)
{
	struct io_rsrc_node *node;

	node = io_find_buf_node(req, issue_flags);
	if (!node)
		return -EFAULT;
	return io_import_fixed(ddir, iter, node->buf, buf_addr, len);
}

/* Lock two rings at once. The rings must be different! */
static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
{
	if (ctx1 > ctx2)
		swap(ctx1, ctx2);
	mutex_lock(&ctx1->uring_lock);
	mutex_lock_nested(&ctx2->uring_lock, SINGLE_DEPTH_NESTING);
}

/* Both rings are locked by the caller. */
static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
			    struct io_uring_clone_buffers *arg)
{
	struct io_rsrc_data data;
	int i, ret, off, nr;
	unsigned int nbufs;

	lockdep_assert_held(&ctx->uring_lock);
	lockdep_assert_held(&src_ctx->uring_lock);

	/*
	 * Accounting state is shared between the two rings; that only works if
	 * both rings are accounted towards the same counters.
	 */
	if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account)
		return -EINVAL;

	/* if offsets are given, must have nr specified too */
	if (!arg->nr && (arg->dst_off || arg->src_off))
		return -EINVAL;
	/* not allowed unless REPLACE is set */
	if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
		return -EBUSY;

	nbufs = src_ctx->buf_table.nr;
	if (!arg->nr)
		arg->nr = nbufs;
	else if (arg->nr > nbufs)
		return -EINVAL;
	else if (arg->nr > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
		return -EOVERFLOW;
	if (nbufs > IORING_MAX_REG_BUFFERS)
		return -EINVAL;

	ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr));
	if (ret)
		return ret;

	/* Fill entries in data from dst that won't overlap with src */
	for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
		struct io_rsrc_node *src_node = ctx->buf_table.nodes[i];

		if (src_node) {
			data.nodes[i] = src_node;
			src_node->refs++;
		}
	}

	ret = -ENXIO;
	nbufs = src_ctx->buf_table.nr;
	if (!nbufs)
		goto out_free;
	ret = -EINVAL;
	if (!arg->nr)
		arg->nr = nbufs;
	else if (arg->nr > nbufs)
		goto out_free;
	ret = -EOVERFLOW;
	if (check_add_overflow(arg->nr, arg->src_off, &off))
		goto out_free;
	if (off > nbufs)
		goto out_free;

	off = arg->dst_off;
	i = arg->src_off;
	nr = arg->nr;
	while (nr--) {
		struct io_rsrc_node *dst_node, *src_node;

		src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i);
		if (!src_node) {
			dst_node = NULL;
		} else {
			dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
			if (!dst_node) {
				ret = -ENOMEM;
				goto out_free;
			}

			refcount_inc(&src_node->buf->refs);
			dst_node->buf = src_node->buf;
		}
		data.nodes[off++] = dst_node;
		i++;
	}

	/*
	 * If asked for replace, put the old table. data->nodes[] holds both
	 * old and new nodes at this point.
	 */
	if (arg->flags & IORING_REGISTER_DST_REPLACE)
		io_rsrc_data_free(ctx, &ctx->buf_table);

	/*
	 * ctx->buf_table must be empty now - either the contents are being
	 * replaced and we just freed the table, or the contents are being
	 * copied to a ring that does not have buffers yet (checked at function
	 * entry).
	 */
	WARN_ON_ONCE(ctx->buf_table.nr);
	ctx->buf_table = data;
	return 0;

out_free:
	io_rsrc_data_free(ctx, &data);
	return ret;
}

/*
 * Copy the registered buffers from the source ring whose file descriptor is
 * given in src_fd to the current ring. This is identical to registering the
 * buffers with ctx, except faster as mappings already exist.
 *
 * Since the memory is already accounted once, don't account it again.
 */
int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_clone_buffers buf;
	struct io_ring_ctx *src_ctx;
	bool registered_src;
	struct file *file;
	int ret;

	if (copy_from_user(&buf, arg, sizeof(buf)))
		return -EFAULT;
	if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE))
		return -EINVAL;
	if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr)
		return -EBUSY;
	if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
		return -EINVAL;

	registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
	file = io_uring_register_get_file(buf.src_fd, registered_src);
	if (IS_ERR(file))
		return PTR_ERR(file);

	src_ctx = file->private_data;
	if (src_ctx != ctx) {
		mutex_unlock(&ctx->uring_lock);
		lock_two_rings(ctx, src_ctx);
	}

	ret = io_clone_buffers(ctx, src_ctx, &buf);

	if (src_ctx != ctx)
		mutex_unlock(&src_ctx->uring_lock);

	fput(file);
	return ret;
}
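
/*
 * Helpers for iou_vec, the scratch vector used by the *_reg_vec import paths:
 * user iovecs are parked at the tail of the array (io_prep_reg_iovec()) while
 * the resulting bio_vecs are written from the front of the same allocation
 * (io_vec_fill_bvec() and io_vec_fill_kern_bvec()).
 */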
void io_vec_free(struct iou_vec *iv)
{
	if (!iv->iovec)
		return;
	kfree(iv->iovec);
	iv->iovec = NULL;
	iv->nr = 0;
}

int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries)
{
	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
	struct iovec *iov;

	iov = kmalloc_array(nr_entries, sizeof(iov[0]), gfp);
	if (!iov)
		return -ENOMEM;

	io_vec_free(iv);
	iv->iovec = iov;
	iv->nr = nr_entries;
	return 0;
}

static int io_vec_fill_bvec(int ddir, struct iov_iter *iter,
			    struct io_mapped_ubuf *imu,
			    struct iovec *iovec, unsigned nr_iovs,
			    struct iou_vec *vec)
{
	unsigned long folio_size = 1 << imu->folio_shift;
	unsigned long folio_mask = folio_size - 1;
	u64 folio_addr = imu->ubuf & ~folio_mask;
	struct bio_vec *res_bvec = vec->bvec;
	size_t total_len = 0;
	unsigned bvec_idx = 0;
	unsigned iov_idx;

	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
		size_t iov_len = iovec[iov_idx].iov_len;
		u64 buf_addr = (u64)(uintptr_t)iovec[iov_idx].iov_base;
		struct bio_vec *src_bvec;
		size_t offset;
		int ret;

		ret = validate_fixed_range(buf_addr, iov_len, imu);
		if (unlikely(ret))
			return ret;

		if (unlikely(!iov_len))
			return -EFAULT;
		if (unlikely(check_add_overflow(total_len, iov_len, &total_len)))
			return -EOVERFLOW;

		/* by using folio address it also accounts for bvec offset */
		offset = buf_addr - folio_addr;
		src_bvec = imu->bvec + (offset >> imu->folio_shift);
		offset &= folio_mask;

		for (; iov_len; offset = 0, bvec_idx++, src_bvec++) {
			size_t seg_size = min_t(size_t, iov_len,
						folio_size - offset);

			bvec_set_page(&res_bvec[bvec_idx],
				      src_bvec->bv_page, seg_size, offset);
			iov_len -= seg_size;
		}
	}
	if (total_len > MAX_RW_COUNT)
		return -EINVAL;

	iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len);
	return 0;
}
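
/*
 * Worst-case estimate of how many bio_vecs a set of iovecs can expand to for
 * a user-registered buffer: each iovec may span iov_len >> folio_shift full
 * folios plus a partial head and tail segment, hence the +2.
 */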
static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs,
				 struct io_mapped_ubuf *imu)
{
	unsigned shift = imu->folio_shift;
	size_t max_segs = 0;
	unsigned i;

	for (i = 0; i < nr_iovs; i++)
		max_segs += (iov[i].iov_len >> shift) + 2;
	return max_segs;
}

static int io_vec_fill_kern_bvec(int ddir, struct iov_iter *iter,
				 struct io_mapped_ubuf *imu,
				 struct iovec *iovec, unsigned nr_iovs,
				 struct iou_vec *vec)
{
	const struct bio_vec *src_bvec = imu->bvec;
	struct bio_vec *res_bvec = vec->bvec;
	unsigned res_idx = 0;
	size_t total_len = 0;
	unsigned iov_idx;

	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
		size_t offset = (size_t)(uintptr_t)iovec[iov_idx].iov_base;
		size_t iov_len = iovec[iov_idx].iov_len;
		struct bvec_iter bi = {
			.bi_size = offset + iov_len,
		};
		struct bio_vec bv;

		bvec_iter_advance(src_bvec, &bi, offset);
		for_each_mp_bvec(bv, src_bvec, bi, bi)
			res_bvec[res_idx++] = bv;
		total_len += iov_len;
	}
	iov_iter_bvec(iter, ddir, res_bvec, res_idx, total_len);
	return 0;
}

static int iov_kern_bvec_size(const struct iovec *iov,
			      const struct io_mapped_ubuf *imu,
			      unsigned int *nr_seg)
{
	size_t offset = (size_t)(uintptr_t)iov->iov_base;
	const struct bio_vec *bvec = imu->bvec;
	int start = 0, i = 0;
	size_t off = 0;
	int ret;

	ret = validate_fixed_range(offset, iov->iov_len, imu);
	if (unlikely(ret))
		return ret;

	for (i = 0; off < offset + iov->iov_len && i < imu->nr_bvecs;
	     off += bvec[i].bv_len, i++) {
		if (offset >= off && offset < off + bvec[i].bv_len)
			start = i;
	}
	*nr_seg = i - start;
	return 0;
}

static int io_kern_bvec_size(struct iovec *iov, unsigned nr_iovs,
			     struct io_mapped_ubuf *imu, unsigned *nr_segs)
{
	unsigned max_segs = 0;
	size_t total_len = 0;
	unsigned i;
	int ret;

	*nr_segs = 0;
	for (i = 0; i < nr_iovs; i++) {
		if (unlikely(!iov[i].iov_len))
			return -EFAULT;
		if (unlikely(check_add_overflow(total_len, iov[i].iov_len,
						&total_len)))
			return -EOVERFLOW;
		ret = iov_kern_bvec_size(&iov[i], imu, &max_segs);
		if (unlikely(ret))
			return ret;
		*nr_segs += max_segs;
	}
	if (total_len > MAX_RW_COUNT)
		return -EINVAL;
	return 0;
}
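
/*
 * Import a user-supplied vector of addresses that all point into one
 * registered buffer. The iovecs sit at the tail of *vec and the resulting
 * bio_vecs are written from the front of the same allocation, which is why
 * the segment estimate is converted to iovec-sized units and the array is
 * grown when it cannot hold both.
 */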
int io_import_reg_vec(int ddir, struct iov_iter *iter,
		      struct io_kiocb *req, struct iou_vec *vec,
		      unsigned nr_iovs, unsigned issue_flags)
{
	struct io_rsrc_node *node;
	struct io_mapped_ubuf *imu;
	unsigned iovec_off;
	struct iovec *iov;
	unsigned nr_segs;

	node = io_find_buf_node(req, issue_flags);
	if (!node)
		return -EFAULT;
	imu = node->buf;
	if (!(imu->dir & (1 << ddir)))
		return -EFAULT;

	iovec_off = vec->nr - nr_iovs;
	iov = vec->iovec + iovec_off;

	if (imu->is_kbuf) {
		int ret = io_kern_bvec_size(iov, nr_iovs, imu, &nr_segs);

		if (unlikely(ret))
			return ret;
	} else {
		nr_segs = io_estimate_bvec_size(iov, nr_iovs, imu);
	}

	if (sizeof(struct bio_vec) > sizeof(struct iovec)) {
		size_t bvec_bytes;

		bvec_bytes = nr_segs * sizeof(struct bio_vec);
		nr_segs = (bvec_bytes + sizeof(*iov) - 1) / sizeof(*iov);
		nr_segs += nr_iovs;
	}

	if (nr_segs > vec->nr) {
		struct iou_vec tmp_vec = {};
		int ret;

		ret = io_vec_realloc(&tmp_vec, nr_segs);
		if (ret)
			return ret;

		iovec_off = tmp_vec.nr - nr_iovs;
		memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs);
		io_vec_free(vec);

		*vec = tmp_vec;
		iov = vec->iovec + iovec_off;
		req->flags |= REQ_F_NEED_CLEANUP;
	}

	if (imu->is_kbuf)
		return io_vec_fill_kern_bvec(ddir, iter, imu, iov, nr_iovs, vec);

	return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec);
}

int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv,
		      const struct iovec __user *uvec, size_t uvec_segs)
{
	struct iovec *iov;
	int iovec_off, ret;
	void *res;

	if (uvec_segs > iv->nr) {
		ret = io_vec_realloc(iv, uvec_segs);
		if (ret)
			return ret;
		req->flags |= REQ_F_NEED_CLEANUP;
	}

	/* pad iovec to the right */
	iovec_off = iv->nr - uvec_segs;
	iov = iv->iovec + iovec_off;
	res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov,
			      io_is_compat(req->ctx));
	if (IS_ERR(res))
		return PTR_ERR(res);

	req->flags |= REQ_F_IMPORT_BUFFER;
	return 0;
}