1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/kernel.h> 3 #include <linux/errno.h> 4 #include <linux/fs.h> 5 #include <linux/file.h> 6 #include <linux/mm.h> 7 #include <linux/slab.h> 8 #include <linux/nospec.h> 9 #include <linux/hugetlb.h> 10 #include <linux/compat.h> 11 #include <linux/io_uring.h> 12 #include <linux/io_uring/cmd.h> 13 14 #include <uapi/linux/io_uring.h> 15 16 #include "filetable.h" 17 #include "io_uring.h" 18 #include "openclose.h" 19 #include "rsrc.h" 20 #include "memmap.h" 21 #include "register.h" 22 23 struct io_rsrc_update { 24 struct file *file; 25 u64 arg; 26 u32 nr_args; 27 u32 offset; 28 }; 29 30 static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, 31 struct iovec *iov, struct page **last_hpage); 32 33 /* only define max */ 34 #define IORING_MAX_FIXED_FILES (1U << 20) 35 #define IORING_MAX_REG_BUFFERS (1U << 14) 36 37 #define IO_CACHED_BVECS_SEGS 32 38 39 int __io_account_mem(struct user_struct *user, unsigned long nr_pages) 40 { 41 unsigned long page_limit, cur_pages, new_pages; 42 43 if (!nr_pages) 44 return 0; 45 46 /* Don't allow more pages than we can safely lock */ 47 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; 48 49 cur_pages = atomic_long_read(&user->locked_vm); 50 do { 51 new_pages = cur_pages + nr_pages; 52 if (new_pages > page_limit) 53 return -ENOMEM; 54 } while (!atomic_long_try_cmpxchg(&user->locked_vm, 55 &cur_pages, new_pages)); 56 return 0; 57 } 58 59 void io_unaccount_mem(struct user_struct *user, struct mm_struct *mm_account, 60 unsigned long nr_pages) 61 { 62 if (user) 63 __io_unaccount_mem(user, nr_pages); 64 65 if (mm_account) 66 atomic64_sub(nr_pages, &mm_account->pinned_vm); 67 } 68 69 int io_account_mem(struct user_struct *user, struct mm_struct *mm_account, 70 unsigned long nr_pages) 71 { 72 int ret; 73 74 if (user) { 75 ret = __io_account_mem(user, nr_pages); 76 if (ret) 77 return ret; 78 } 79 80 if (mm_account) 81 atomic64_add(nr_pages, &mm_account->pinned_vm); 82 83 return 0; 84 } 85 86 int io_validate_user_buf_range(u64 uaddr, u64 ulen) 87 { 88 unsigned long tmp, base = (unsigned long)uaddr; 89 unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen); 90 91 /* arbitrary limit, but we need something */ 92 if (ulen > SZ_1G || !ulen) 93 return -EFAULT; 94 if (check_add_overflow(base, acct_len, &tmp)) 95 return -EOVERFLOW; 96 return 0; 97 } 98 99 static void io_release_ubuf(void *priv) 100 { 101 struct io_mapped_ubuf *imu = priv; 102 unsigned int i; 103 104 for (i = 0; i < imu->nr_bvecs; i++) { 105 struct folio *folio = page_folio(imu->bvec[i].bv_page); 106 107 unpin_user_folio(folio, 1); 108 } 109 } 110 111 static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx, 112 int nr_bvecs) 113 { 114 if (nr_bvecs <= IO_CACHED_BVECS_SEGS) 115 return io_cache_alloc(&ctx->imu_cache, GFP_KERNEL); 116 return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs), 117 GFP_KERNEL); 118 } 119 120 static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) 121 { 122 if (imu->nr_bvecs <= IO_CACHED_BVECS_SEGS) 123 io_cache_free(&ctx->imu_cache, imu); 124 else 125 kvfree(imu); 126 } 127 128 static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) 129 { 130 if (unlikely(refcount_read(&imu->refs) > 1)) { 131 if (!refcount_dec_and_test(&imu->refs)) 132 return; 133 } 134 135 if (imu->acct_pages) 136 io_unaccount_mem(ctx->user, ctx->mm_account, imu->acct_pages); 137 imu->release(imu->priv); 138 io_free_imu(ctx, imu); 139 } 140 141 struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type) 142 { 143 struct io_rsrc_node *node; 144 145 node = io_cache_alloc(&ctx->node_cache, GFP_KERNEL); 146 if (node) { 147 node->type = type; 148 node->refs = 1; 149 node->tag = 0; 150 node->file_ptr = 0; 151 } 152 return node; 153 } 154 155 bool io_rsrc_cache_init(struct io_ring_ctx *ctx) 156 { 157 const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec, 158 IO_CACHED_BVECS_SEGS); 159 const int node_size = sizeof(struct io_rsrc_node); 160 bool ret; 161 162 ret = io_alloc_cache_init(&ctx->node_cache, IO_ALLOC_CACHE_MAX, 163 node_size, 0); 164 ret |= io_alloc_cache_init(&ctx->imu_cache, IO_ALLOC_CACHE_MAX, 165 imu_cache_size, 0); 166 return ret; 167 } 168 169 void io_rsrc_cache_free(struct io_ring_ctx *ctx) 170 { 171 io_alloc_cache_free(&ctx->node_cache, kfree); 172 io_alloc_cache_free(&ctx->imu_cache, kfree); 173 } 174 175 static void io_clear_table_tags(struct io_rsrc_data *data) 176 { 177 int i; 178 179 for (i = 0; i < data->nr; i++) { 180 struct io_rsrc_node *node = data->nodes[i]; 181 182 if (node) 183 node->tag = 0; 184 } 185 } 186 187 __cold void io_rsrc_data_free(struct io_ring_ctx *ctx, 188 struct io_rsrc_data *data) 189 { 190 if (!data->nr) 191 return; 192 while (data->nr--) { 193 if (data->nodes[data->nr]) 194 io_put_rsrc_node(ctx, data->nodes[data->nr]); 195 } 196 kvfree(data->nodes); 197 data->nodes = NULL; 198 data->nr = 0; 199 } 200 201 __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr) 202 { 203 data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *), 204 GFP_KERNEL_ACCOUNT | __GFP_ZERO); 205 if (data->nodes) { 206 data->nr = nr; 207 return 0; 208 } 209 return -ENOMEM; 210 } 211 212 static int __io_sqe_files_update(struct io_ring_ctx *ctx, 213 struct io_uring_rsrc_update2 *up, 214 unsigned nr_args) 215 { 216 u64 __user *tags = u64_to_user_ptr(up->tags); 217 __s32 __user *fds = u64_to_user_ptr(up->data); 218 int fd, i, err = 0; 219 unsigned int done; 220 221 if (!ctx->file_table.data.nr) 222 return -ENXIO; 223 if (up->offset + nr_args > ctx->file_table.data.nr) 224 return -EINVAL; 225 226 for (done = 0; done < nr_args; done++) { 227 u64 tag = 0; 228 229 if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) || 230 copy_from_user(&fd, &fds[done], sizeof(fd))) { 231 err = -EFAULT; 232 break; 233 } 234 if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) { 235 err = -EINVAL; 236 break; 237 } 238 if (fd == IORING_REGISTER_FILES_SKIP) 239 continue; 240 241 i = up->offset + done; 242 if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i)) 243 io_file_bitmap_clear(&ctx->file_table, i); 244 245 if (fd != -1) { 246 struct file *file = fget(fd); 247 struct io_rsrc_node *node; 248 249 if (!file) { 250 err = -EBADF; 251 break; 252 } 253 /* 254 * Don't allow io_uring instances to be registered. 255 */ 256 if (io_is_uring_fops(file)) { 257 fput(file); 258 err = -EBADF; 259 break; 260 } 261 node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); 262 if (!node) { 263 err = -ENOMEM; 264 fput(file); 265 break; 266 } 267 ctx->file_table.data.nodes[i] = node; 268 if (tag) 269 node->tag = tag; 270 io_fixed_file_set(node, file); 271 io_file_bitmap_set(&ctx->file_table, i); 272 } 273 } 274 return done ? done : err; 275 } 276 277 static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, 278 struct io_uring_rsrc_update2 *up, 279 unsigned int nr_args) 280 { 281 u64 __user *tags = u64_to_user_ptr(up->tags); 282 struct iovec fast_iov, *iov; 283 struct page *last_hpage = NULL; 284 struct iovec __user *uvec; 285 u64 user_data = up->data; 286 __u32 done; 287 int i, err; 288 289 if (!ctx->buf_table.nr) 290 return -ENXIO; 291 if (up->offset + nr_args > ctx->buf_table.nr) 292 return -EINVAL; 293 294 for (done = 0; done < nr_args; done++) { 295 struct io_rsrc_node *node; 296 u64 tag = 0; 297 298 uvec = u64_to_user_ptr(user_data); 299 iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); 300 if (IS_ERR(iov)) { 301 err = PTR_ERR(iov); 302 break; 303 } 304 if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) { 305 err = -EFAULT; 306 break; 307 } 308 node = io_sqe_buffer_register(ctx, iov, &last_hpage); 309 if (IS_ERR(node)) { 310 err = PTR_ERR(node); 311 break; 312 } 313 if (tag) { 314 if (!node) { 315 err = -EINVAL; 316 break; 317 } 318 node->tag = tag; 319 } 320 i = array_index_nospec(up->offset + done, ctx->buf_table.nr); 321 io_reset_rsrc_node(ctx, &ctx->buf_table, i); 322 ctx->buf_table.nodes[i] = node; 323 if (ctx->compat) 324 user_data += sizeof(struct compat_iovec); 325 else 326 user_data += sizeof(struct iovec); 327 } 328 return done ? done : err; 329 } 330 331 static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, 332 struct io_uring_rsrc_update2 *up, 333 unsigned nr_args) 334 { 335 __u32 tmp; 336 337 lockdep_assert_held(&ctx->uring_lock); 338 339 if (check_add_overflow(up->offset, nr_args, &tmp)) 340 return -EOVERFLOW; 341 342 switch (type) { 343 case IORING_RSRC_FILE: 344 return __io_sqe_files_update(ctx, up, nr_args); 345 case IORING_RSRC_BUFFER: 346 return __io_sqe_buffers_update(ctx, up, nr_args); 347 } 348 return -EINVAL; 349 } 350 351 int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, 352 unsigned nr_args) 353 { 354 struct io_uring_rsrc_update2 up; 355 356 if (!nr_args) 357 return -EINVAL; 358 memset(&up, 0, sizeof(up)); 359 if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update))) 360 return -EFAULT; 361 if (up.resv || up.resv2) 362 return -EINVAL; 363 return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args); 364 } 365 366 int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, 367 unsigned size, unsigned type) 368 { 369 struct io_uring_rsrc_update2 up; 370 371 if (size != sizeof(up)) 372 return -EINVAL; 373 if (copy_from_user(&up, arg, sizeof(up))) 374 return -EFAULT; 375 if (!up.nr || up.resv || up.resv2) 376 return -EINVAL; 377 return __io_register_rsrc_update(ctx, type, &up, up.nr); 378 } 379 380 __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, 381 unsigned int size, unsigned int type) 382 { 383 struct io_uring_rsrc_register rr; 384 385 /* keep it extendible */ 386 if (size != sizeof(rr)) 387 return -EINVAL; 388 389 memset(&rr, 0, sizeof(rr)); 390 if (copy_from_user(&rr, arg, size)) 391 return -EFAULT; 392 if (!rr.nr || rr.resv2) 393 return -EINVAL; 394 if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE) 395 return -EINVAL; 396 397 switch (type) { 398 case IORING_RSRC_FILE: 399 if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) 400 break; 401 return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data), 402 rr.nr, u64_to_user_ptr(rr.tags)); 403 case IORING_RSRC_BUFFER: 404 if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) 405 break; 406 return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data), 407 rr.nr, u64_to_user_ptr(rr.tags)); 408 } 409 return -EINVAL; 410 } 411 412 int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 413 { 414 struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update); 415 416 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) 417 return -EINVAL; 418 if (sqe->rw_flags || sqe->splice_fd_in) 419 return -EINVAL; 420 421 up->offset = READ_ONCE(sqe->off); 422 up->nr_args = READ_ONCE(sqe->len); 423 if (!up->nr_args) 424 return -EINVAL; 425 up->arg = READ_ONCE(sqe->addr); 426 return 0; 427 } 428 429 static int io_files_update_with_index_alloc(struct io_kiocb *req, 430 unsigned int issue_flags) 431 { 432 struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update); 433 __s32 __user *fds = u64_to_user_ptr(up->arg); 434 unsigned int done; 435 struct file *file; 436 int ret, fd; 437 438 if (!req->ctx->file_table.data.nr) 439 return -ENXIO; 440 441 for (done = 0; done < up->nr_args; done++) { 442 if (get_user(fd, &fds[done])) { 443 ret = -EFAULT; 444 break; 445 } 446 447 file = fget(fd); 448 if (!file) { 449 ret = -EBADF; 450 break; 451 } 452 ret = io_fixed_fd_install(req, issue_flags, file, 453 IORING_FILE_INDEX_ALLOC); 454 if (ret < 0) 455 break; 456 if (put_user(ret, &fds[done])) { 457 __io_close_fixed(req->ctx, issue_flags, ret); 458 ret = -EFAULT; 459 break; 460 } 461 } 462 463 if (done) 464 return done; 465 return ret; 466 } 467 468 int io_files_update(struct io_kiocb *req, unsigned int issue_flags) 469 { 470 struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update); 471 struct io_ring_ctx *ctx = req->ctx; 472 struct io_uring_rsrc_update2 up2; 473 int ret; 474 475 up2.offset = up->offset; 476 up2.data = up->arg; 477 up2.nr = 0; 478 up2.tags = 0; 479 up2.resv = 0; 480 up2.resv2 = 0; 481 482 if (up->offset == IORING_FILE_INDEX_ALLOC) { 483 ret = io_files_update_with_index_alloc(req, issue_flags); 484 } else { 485 io_ring_submit_lock(ctx, issue_flags); 486 ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, 487 &up2, up->nr_args); 488 io_ring_submit_unlock(ctx, issue_flags); 489 } 490 491 if (ret < 0) 492 req_set_fail(req); 493 io_req_set_res(req, ret, 0); 494 return IOU_COMPLETE; 495 } 496 497 void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) 498 { 499 if (node->tag) 500 io_post_aux_cqe(ctx, node->tag, 0, 0); 501 502 switch (node->type) { 503 case IORING_RSRC_FILE: 504 fput(io_slot_file(node)); 505 break; 506 case IORING_RSRC_BUFFER: 507 io_buffer_unmap(ctx, node->buf); 508 break; 509 default: 510 WARN_ON_ONCE(1); 511 break; 512 } 513 514 io_cache_free(&ctx->node_cache, node); 515 } 516 517 int io_sqe_files_unregister(struct io_ring_ctx *ctx) 518 { 519 if (!ctx->file_table.data.nr) 520 return -ENXIO; 521 522 io_free_file_tables(ctx, &ctx->file_table); 523 io_file_table_set_alloc_range(ctx, 0, 0); 524 return 0; 525 } 526 527 int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, 528 unsigned nr_args, u64 __user *tags) 529 { 530 __s32 __user *fds = (__s32 __user *) arg; 531 struct file *file; 532 int fd, ret; 533 unsigned i; 534 535 if (ctx->file_table.data.nr) 536 return -EBUSY; 537 if (!nr_args) 538 return -EINVAL; 539 if (nr_args > IORING_MAX_FIXED_FILES) 540 return -EMFILE; 541 if (nr_args > rlimit(RLIMIT_NOFILE)) 542 return -EMFILE; 543 if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args)) 544 return -ENOMEM; 545 546 for (i = 0; i < nr_args; i++) { 547 struct io_rsrc_node *node; 548 u64 tag = 0; 549 550 ret = -EFAULT; 551 if (tags && copy_from_user(&tag, &tags[i], sizeof(tag))) 552 goto fail; 553 if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) 554 goto fail; 555 /* allow sparse sets */ 556 if (!fds || fd == -1) { 557 ret = -EINVAL; 558 if (tag) 559 goto fail; 560 continue; 561 } 562 563 file = fget(fd); 564 ret = -EBADF; 565 if (unlikely(!file)) 566 goto fail; 567 568 /* 569 * Don't allow io_uring instances to be registered. 570 */ 571 if (io_is_uring_fops(file)) { 572 fput(file); 573 goto fail; 574 } 575 ret = -ENOMEM; 576 node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); 577 if (!node) { 578 fput(file); 579 goto fail; 580 } 581 if (tag) 582 node->tag = tag; 583 ctx->file_table.data.nodes[i] = node; 584 io_fixed_file_set(node, file); 585 io_file_bitmap_set(&ctx->file_table, i); 586 } 587 588 /* default it to the whole table */ 589 io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr); 590 return 0; 591 fail: 592 io_clear_table_tags(&ctx->file_table.data); 593 io_sqe_files_unregister(ctx); 594 return ret; 595 } 596 597 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) 598 { 599 if (!ctx->buf_table.nr) 600 return -ENXIO; 601 io_rsrc_data_free(ctx, &ctx->buf_table); 602 return 0; 603 } 604 605 /* 606 * Not super efficient, but this is just a registration time. And we do cache 607 * the last compound head, so generally we'll only do a full search if we don't 608 * match that one. 609 * 610 * We check if the given compound head page has already been accounted, to 611 * avoid double accounting it. This allows us to account the full size of the 612 * page, not just the constituent pages of a huge page. 613 */ 614 static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages, 615 int nr_pages, struct page *hpage) 616 { 617 int i, j; 618 619 /* check current page array */ 620 for (i = 0; i < nr_pages; i++) { 621 if (!PageCompound(pages[i])) 622 continue; 623 if (compound_head(pages[i]) == hpage) 624 return true; 625 } 626 627 /* check previously registered pages */ 628 for (i = 0; i < ctx->buf_table.nr; i++) { 629 struct io_rsrc_node *node = ctx->buf_table.nodes[i]; 630 struct io_mapped_ubuf *imu; 631 632 if (!node) 633 continue; 634 imu = node->buf; 635 for (j = 0; j < imu->nr_bvecs; j++) { 636 if (!PageCompound(imu->bvec[j].bv_page)) 637 continue; 638 if (compound_head(imu->bvec[j].bv_page) == hpage) 639 return true; 640 } 641 } 642 643 return false; 644 } 645 646 static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, 647 int nr_pages, struct io_mapped_ubuf *imu, 648 struct page **last_hpage) 649 { 650 int i, ret; 651 652 imu->acct_pages = 0; 653 for (i = 0; i < nr_pages; i++) { 654 if (!PageCompound(pages[i])) { 655 imu->acct_pages++; 656 } else { 657 struct page *hpage; 658 659 hpage = compound_head(pages[i]); 660 if (hpage == *last_hpage) 661 continue; 662 *last_hpage = hpage; 663 if (headpage_already_acct(ctx, pages, i, hpage)) 664 continue; 665 imu->acct_pages += page_size(hpage) >> PAGE_SHIFT; 666 } 667 } 668 669 if (!imu->acct_pages) 670 return 0; 671 672 ret = io_account_mem(ctx->user, ctx->mm_account, imu->acct_pages); 673 if (ret) 674 imu->acct_pages = 0; 675 return ret; 676 } 677 678 static bool io_coalesce_buffer(struct page ***pages, int *nr_pages, 679 struct io_imu_folio_data *data) 680 { 681 struct page **page_array = *pages, **new_array = NULL; 682 unsigned nr_pages_left = *nr_pages; 683 unsigned nr_folios = data->nr_folios; 684 unsigned i, j; 685 686 /* Store head pages only*/ 687 new_array = kvmalloc_array(nr_folios, sizeof(struct page *), GFP_KERNEL); 688 if (!new_array) 689 return false; 690 691 for (i = 0, j = 0; i < nr_folios; i++) { 692 struct page *p = compound_head(page_array[j]); 693 struct folio *folio = page_folio(p); 694 unsigned int nr; 695 696 WARN_ON_ONCE(i > 0 && p != page_array[j]); 697 698 nr = i ? data->nr_pages_mid : data->nr_pages_head; 699 nr = min(nr, nr_pages_left); 700 /* Drop all but one ref, the entire folio will remain pinned. */ 701 if (nr > 1) 702 unpin_user_folio(folio, nr - 1); 703 j += nr; 704 nr_pages_left -= nr; 705 new_array[i] = p; 706 } 707 708 WARN_ON_ONCE(j != *nr_pages); 709 710 kvfree(page_array); 711 *pages = new_array; 712 *nr_pages = nr_folios; 713 return true; 714 } 715 716 bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, 717 struct io_imu_folio_data *data) 718 { 719 struct folio *folio = page_folio(page_array[0]); 720 unsigned int count = 1, nr_folios = 1; 721 int i; 722 723 data->nr_pages_mid = folio_nr_pages(folio); 724 data->folio_shift = folio_shift(folio); 725 data->first_folio_page_idx = folio_page_idx(folio, page_array[0]); 726 727 /* 728 * Check if pages are contiguous inside a folio, and all folios have 729 * the same page count except for the head and tail. 730 */ 731 for (i = 1; i < nr_pages; i++) { 732 if (page_folio(page_array[i]) == folio && 733 page_array[i] == page_array[i-1] + 1) { 734 count++; 735 continue; 736 } 737 738 if (nr_folios == 1) { 739 if (folio_page_idx(folio, page_array[i-1]) != 740 data->nr_pages_mid - 1) 741 return false; 742 743 data->nr_pages_head = count; 744 } else if (count != data->nr_pages_mid) { 745 return false; 746 } 747 748 folio = page_folio(page_array[i]); 749 if (folio_size(folio) != (1UL << data->folio_shift) || 750 folio_page_idx(folio, page_array[i]) != 0) 751 return false; 752 753 count = 1; 754 nr_folios++; 755 } 756 if (nr_folios == 1) 757 data->nr_pages_head = count; 758 759 data->nr_folios = nr_folios; 760 return true; 761 } 762 763 static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, 764 struct iovec *iov, 765 struct page **last_hpage) 766 { 767 struct io_mapped_ubuf *imu = NULL; 768 struct page **pages = NULL; 769 struct io_rsrc_node *node; 770 unsigned long off; 771 size_t size; 772 int ret, nr_pages, i; 773 struct io_imu_folio_data data; 774 bool coalesced = false; 775 776 if (!iov->iov_base) { 777 if (iov->iov_len) 778 return ERR_PTR(-EFAULT); 779 /* remove the buffer without installing a new one */ 780 return NULL; 781 } 782 783 ret = io_validate_user_buf_range((unsigned long)iov->iov_base, 784 iov->iov_len); 785 if (ret) 786 return ERR_PTR(ret); 787 788 node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); 789 if (!node) 790 return ERR_PTR(-ENOMEM); 791 792 ret = -ENOMEM; 793 pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, 794 &nr_pages); 795 if (IS_ERR(pages)) { 796 ret = PTR_ERR(pages); 797 pages = NULL; 798 goto done; 799 } 800 801 /* If it's huge page(s), try to coalesce them into fewer bvec entries */ 802 if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) { 803 if (data.nr_pages_mid != 1) 804 coalesced = io_coalesce_buffer(&pages, &nr_pages, &data); 805 } 806 807 imu = io_alloc_imu(ctx, nr_pages); 808 if (!imu) 809 goto done; 810 811 imu->nr_bvecs = nr_pages; 812 ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage); 813 if (ret) 814 goto done; 815 816 size = iov->iov_len; 817 /* store original address for later verification */ 818 imu->ubuf = (unsigned long) iov->iov_base; 819 imu->len = iov->iov_len; 820 imu->folio_shift = PAGE_SHIFT; 821 imu->release = io_release_ubuf; 822 imu->priv = imu; 823 imu->flags = 0; 824 imu->dir = IO_IMU_DEST | IO_IMU_SOURCE; 825 if (coalesced) 826 imu->folio_shift = data.folio_shift; 827 refcount_set(&imu->refs, 1); 828 829 off = (unsigned long)iov->iov_base & ~PAGE_MASK; 830 if (coalesced) 831 off += data.first_folio_page_idx << PAGE_SHIFT; 832 833 node->buf = imu; 834 ret = 0; 835 836 for (i = 0; i < nr_pages; i++) { 837 size_t vec_len; 838 839 vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off); 840 bvec_set_page(&imu->bvec[i], pages[i], vec_len, off); 841 off = 0; 842 size -= vec_len; 843 } 844 done: 845 if (ret) { 846 if (imu) 847 io_free_imu(ctx, imu); 848 if (pages) { 849 for (i = 0; i < nr_pages; i++) 850 unpin_user_folio(page_folio(pages[i]), 1); 851 } 852 io_cache_free(&ctx->node_cache, node); 853 node = ERR_PTR(ret); 854 } 855 kvfree(pages); 856 return node; 857 } 858 859 int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, 860 unsigned int nr_args, u64 __user *tags) 861 { 862 struct page *last_hpage = NULL; 863 struct io_rsrc_data data; 864 struct iovec fast_iov, *iov = &fast_iov; 865 const struct iovec __user *uvec; 866 int i, ret; 867 868 BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); 869 870 if (ctx->buf_table.nr) 871 return -EBUSY; 872 if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) 873 return -EINVAL; 874 ret = io_rsrc_data_alloc(&data, nr_args); 875 if (ret) 876 return ret; 877 878 if (!arg) 879 memset(iov, 0, sizeof(*iov)); 880 881 for (i = 0; i < nr_args; i++) { 882 struct io_rsrc_node *node; 883 u64 tag = 0; 884 885 if (arg) { 886 uvec = (struct iovec __user *) arg; 887 iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); 888 if (IS_ERR(iov)) { 889 ret = PTR_ERR(iov); 890 break; 891 } 892 if (ctx->compat) 893 arg += sizeof(struct compat_iovec); 894 else 895 arg += sizeof(struct iovec); 896 } 897 898 if (tags) { 899 if (copy_from_user(&tag, &tags[i], sizeof(tag))) { 900 ret = -EFAULT; 901 break; 902 } 903 } 904 905 node = io_sqe_buffer_register(ctx, iov, &last_hpage); 906 if (IS_ERR(node)) { 907 ret = PTR_ERR(node); 908 break; 909 } 910 if (tag) { 911 if (!node) { 912 ret = -EINVAL; 913 break; 914 } 915 node->tag = tag; 916 } 917 data.nodes[i] = node; 918 } 919 920 ctx->buf_table = data; 921 if (ret) { 922 io_clear_table_tags(&ctx->buf_table); 923 io_sqe_buffers_unregister(ctx); 924 } 925 return ret; 926 } 927 928 int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, 929 void (*release)(void *), unsigned int index, 930 unsigned int issue_flags) 931 { 932 struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; 933 struct io_rsrc_data *data = &ctx->buf_table; 934 struct req_iterator rq_iter; 935 struct io_mapped_ubuf *imu; 936 struct io_rsrc_node *node; 937 struct bio_vec bv; 938 unsigned int nr_bvecs = 0; 939 int ret = 0; 940 941 io_ring_submit_lock(ctx, issue_flags); 942 if (index >= data->nr) { 943 ret = -EINVAL; 944 goto unlock; 945 } 946 index = array_index_nospec(index, data->nr); 947 948 if (data->nodes[index]) { 949 ret = -EBUSY; 950 goto unlock; 951 } 952 953 node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); 954 if (!node) { 955 ret = -ENOMEM; 956 goto unlock; 957 } 958 959 /* 960 * blk_rq_nr_phys_segments() may overestimate the number of bvecs 961 * but avoids needing to iterate over the bvecs 962 */ 963 imu = io_alloc_imu(ctx, blk_rq_nr_phys_segments(rq)); 964 if (!imu) { 965 kfree(node); 966 ret = -ENOMEM; 967 goto unlock; 968 } 969 970 imu->ubuf = 0; 971 imu->len = blk_rq_bytes(rq); 972 imu->acct_pages = 0; 973 imu->folio_shift = PAGE_SHIFT; 974 refcount_set(&imu->refs, 1); 975 imu->release = release; 976 imu->priv = rq; 977 imu->flags = IO_REGBUF_F_KBUF; 978 imu->dir = 1 << rq_data_dir(rq); 979 980 rq_for_each_bvec(bv, rq, rq_iter) 981 imu->bvec[nr_bvecs++] = bv; 982 imu->nr_bvecs = nr_bvecs; 983 984 node->buf = imu; 985 data->nodes[index] = node; 986 unlock: 987 io_ring_submit_unlock(ctx, issue_flags); 988 return ret; 989 } 990 EXPORT_SYMBOL_GPL(io_buffer_register_bvec); 991 992 int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, 993 unsigned int issue_flags) 994 { 995 struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; 996 struct io_rsrc_data *data = &ctx->buf_table; 997 struct io_rsrc_node *node; 998 int ret = 0; 999 1000 io_ring_submit_lock(ctx, issue_flags); 1001 if (index >= data->nr) { 1002 ret = -EINVAL; 1003 goto unlock; 1004 } 1005 index = array_index_nospec(index, data->nr); 1006 1007 node = data->nodes[index]; 1008 if (!node) { 1009 ret = -EINVAL; 1010 goto unlock; 1011 } 1012 if (!(node->buf->flags & IO_REGBUF_F_KBUF)) { 1013 ret = -EBUSY; 1014 goto unlock; 1015 } 1016 1017 io_put_rsrc_node(ctx, node); 1018 data->nodes[index] = NULL; 1019 unlock: 1020 io_ring_submit_unlock(ctx, issue_flags); 1021 return ret; 1022 } 1023 EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec); 1024 1025 static int validate_fixed_range(u64 buf_addr, size_t len, 1026 const struct io_mapped_ubuf *imu) 1027 { 1028 u64 buf_end; 1029 1030 if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) 1031 return -EFAULT; 1032 /* not inside the mapped region */ 1033 if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len))) 1034 return -EFAULT; 1035 if (unlikely(len > MAX_RW_COUNT)) 1036 return -EFAULT; 1037 return 0; 1038 } 1039 1040 static int io_import_kbuf(int ddir, struct iov_iter *iter, 1041 struct io_mapped_ubuf *imu, size_t len, size_t offset) 1042 { 1043 size_t count = len + offset; 1044 1045 iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count); 1046 iov_iter_advance(iter, offset); 1047 return 0; 1048 } 1049 1050 static int io_import_fixed(int ddir, struct iov_iter *iter, 1051 struct io_mapped_ubuf *imu, 1052 u64 buf_addr, size_t len) 1053 { 1054 const struct bio_vec *bvec; 1055 size_t folio_mask; 1056 unsigned nr_segs; 1057 size_t offset; 1058 int ret; 1059 1060 ret = validate_fixed_range(buf_addr, len, imu); 1061 if (unlikely(ret)) 1062 return ret; 1063 if (!(imu->dir & (1 << ddir))) 1064 return -EFAULT; 1065 1066 offset = buf_addr - imu->ubuf; 1067 1068 if (imu->flags & IO_REGBUF_F_KBUF) 1069 return io_import_kbuf(ddir, iter, imu, len, offset); 1070 1071 /* 1072 * Don't use iov_iter_advance() here, as it's really slow for 1073 * using the latter parts of a big fixed buffer - it iterates 1074 * over each segment manually. We can cheat a bit here for user 1075 * registered nodes, because we know that: 1076 * 1077 * 1) it's a BVEC iter, we set it up 1078 * 2) all bvecs are the same in size, except potentially the 1079 * first and last bvec 1080 */ 1081 folio_mask = (1UL << imu->folio_shift) - 1; 1082 bvec = imu->bvec; 1083 if (offset >= bvec->bv_len) { 1084 unsigned long seg_skip; 1085 1086 /* skip first vec */ 1087 offset -= bvec->bv_len; 1088 seg_skip = 1 + (offset >> imu->folio_shift); 1089 bvec += seg_skip; 1090 offset &= folio_mask; 1091 } 1092 nr_segs = (offset + len + bvec->bv_offset + folio_mask) >> imu->folio_shift; 1093 iov_iter_bvec(iter, ddir, bvec, nr_segs, len); 1094 iter->iov_offset = offset; 1095 return 0; 1096 } 1097 1098 inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req, 1099 unsigned issue_flags) 1100 { 1101 struct io_ring_ctx *ctx = req->ctx; 1102 struct io_rsrc_node *node; 1103 1104 if (req->flags & REQ_F_BUF_NODE) 1105 return req->buf_node; 1106 req->flags |= REQ_F_BUF_NODE; 1107 1108 io_ring_submit_lock(ctx, issue_flags); 1109 node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index); 1110 if (node) { 1111 node->refs++; 1112 req->buf_node = node; 1113 io_ring_submit_unlock(ctx, issue_flags); 1114 return node; 1115 } 1116 req->flags &= ~REQ_F_BUF_NODE; 1117 io_ring_submit_unlock(ctx, issue_flags); 1118 return NULL; 1119 } 1120 1121 int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, 1122 u64 buf_addr, size_t len, int ddir, 1123 unsigned issue_flags) 1124 { 1125 struct io_rsrc_node *node; 1126 1127 node = io_find_buf_node(req, issue_flags); 1128 if (!node) 1129 return -EFAULT; 1130 return io_import_fixed(ddir, iter, node->buf, buf_addr, len); 1131 } 1132 1133 /* Lock two rings at once. The rings must be different! */ 1134 static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2) 1135 { 1136 if (ctx1 > ctx2) 1137 swap(ctx1, ctx2); 1138 mutex_lock(&ctx1->uring_lock); 1139 mutex_lock_nested(&ctx2->uring_lock, SINGLE_DEPTH_NESTING); 1140 } 1141 1142 /* Both rings are locked by the caller. */ 1143 static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx, 1144 struct io_uring_clone_buffers *arg) 1145 { 1146 struct io_rsrc_data data; 1147 int i, ret, off, nr; 1148 unsigned int nbufs; 1149 1150 lockdep_assert_held(&ctx->uring_lock); 1151 lockdep_assert_held(&src_ctx->uring_lock); 1152 1153 /* 1154 * Accounting state is shared between the two rings; that only works if 1155 * both rings are accounted towards the same counters. 1156 */ 1157 if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account) 1158 return -EINVAL; 1159 1160 /* if offsets are given, must have nr specified too */ 1161 if (!arg->nr && (arg->dst_off || arg->src_off)) 1162 return -EINVAL; 1163 /* not allowed unless REPLACE is set */ 1164 if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE)) 1165 return -EBUSY; 1166 1167 nbufs = src_ctx->buf_table.nr; 1168 if (!nbufs) 1169 return -ENXIO; 1170 if (!arg->nr) 1171 arg->nr = nbufs; 1172 else if (arg->nr > nbufs) 1173 return -EINVAL; 1174 else if (arg->nr > IORING_MAX_REG_BUFFERS) 1175 return -EINVAL; 1176 if (check_add_overflow(arg->nr, arg->src_off, &off) || off > nbufs) 1177 return -EOVERFLOW; 1178 if (check_add_overflow(arg->nr, arg->dst_off, &nbufs)) 1179 return -EOVERFLOW; 1180 if (nbufs > IORING_MAX_REG_BUFFERS) 1181 return -EINVAL; 1182 1183 ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr)); 1184 if (ret) 1185 return ret; 1186 1187 /* Copy original dst nodes from before the cloned range */ 1188 for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) { 1189 struct io_rsrc_node *node = ctx->buf_table.nodes[i]; 1190 1191 if (node) { 1192 data.nodes[i] = node; 1193 node->refs++; 1194 } 1195 } 1196 1197 off = arg->dst_off; 1198 i = arg->src_off; 1199 nr = arg->nr; 1200 while (nr--) { 1201 struct io_rsrc_node *dst_node, *src_node; 1202 1203 src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i); 1204 if (!src_node) { 1205 dst_node = NULL; 1206 } else { 1207 dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); 1208 if (!dst_node) { 1209 io_rsrc_data_free(ctx, &data); 1210 return -ENOMEM; 1211 } 1212 1213 refcount_inc(&src_node->buf->refs); 1214 dst_node->buf = src_node->buf; 1215 } 1216 data.nodes[off++] = dst_node; 1217 i++; 1218 } 1219 1220 /* Copy original dst nodes from after the cloned range */ 1221 for (i = nbufs; i < ctx->buf_table.nr; i++) { 1222 struct io_rsrc_node *node = ctx->buf_table.nodes[i]; 1223 1224 if (node) { 1225 data.nodes[i] = node; 1226 node->refs++; 1227 } 1228 } 1229 1230 /* 1231 * If asked for replace, put the old table. data->nodes[] holds both 1232 * old and new nodes at this point. 1233 */ 1234 if (arg->flags & IORING_REGISTER_DST_REPLACE) 1235 io_rsrc_data_free(ctx, &ctx->buf_table); 1236 1237 /* 1238 * ctx->buf_table must be empty now - either the contents are being 1239 * replaced and we just freed the table, or the contents are being 1240 * copied to a ring that does not have buffers yet (checked at function 1241 * entry). 1242 */ 1243 WARN_ON_ONCE(ctx->buf_table.nr); 1244 ctx->buf_table = data; 1245 return 0; 1246 } 1247 1248 /* 1249 * Copy the registered buffers from the source ring whose file descriptor 1250 * is given in the src_fd to the current ring. This is identical to registering 1251 * the buffers with ctx, except faster as mappings already exist. 1252 * 1253 * Since the memory is already accounted once, don't account it again. 1254 */ 1255 int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg) 1256 { 1257 struct io_uring_clone_buffers buf; 1258 struct io_ring_ctx *src_ctx; 1259 bool registered_src; 1260 struct file *file; 1261 int ret; 1262 1263 if (copy_from_user(&buf, arg, sizeof(buf))) 1264 return -EFAULT; 1265 if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE)) 1266 return -EINVAL; 1267 if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr) 1268 return -EBUSY; 1269 if (memchr_inv(buf.pad, 0, sizeof(buf.pad))) 1270 return -EINVAL; 1271 1272 registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0; 1273 file = io_uring_register_get_file(buf.src_fd, registered_src); 1274 if (IS_ERR(file)) 1275 return PTR_ERR(file); 1276 1277 src_ctx = file->private_data; 1278 if (src_ctx != ctx) { 1279 mutex_unlock(&ctx->uring_lock); 1280 lock_two_rings(ctx, src_ctx); 1281 1282 if (src_ctx->submitter_task && 1283 src_ctx->submitter_task != current) { 1284 ret = -EEXIST; 1285 goto out; 1286 } 1287 } 1288 1289 ret = io_clone_buffers(ctx, src_ctx, &buf); 1290 1291 out: 1292 if (src_ctx != ctx) 1293 mutex_unlock(&src_ctx->uring_lock); 1294 1295 fput(file); 1296 return ret; 1297 } 1298 1299 void io_vec_free(struct iou_vec *iv) 1300 { 1301 if (!iv->iovec) 1302 return; 1303 kfree(iv->iovec); 1304 iv->iovec = NULL; 1305 iv->nr = 0; 1306 } 1307 1308 int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries) 1309 { 1310 gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_NOWARN; 1311 struct iovec *iov; 1312 1313 iov = kmalloc_array(nr_entries, sizeof(iov[0]), gfp); 1314 if (!iov) 1315 return -ENOMEM; 1316 1317 io_vec_free(iv); 1318 iv->iovec = iov; 1319 iv->nr = nr_entries; 1320 return 0; 1321 } 1322 1323 static int io_vec_fill_bvec(int ddir, struct iov_iter *iter, 1324 struct io_mapped_ubuf *imu, 1325 struct iovec *iovec, unsigned nr_iovs, 1326 struct iou_vec *vec) 1327 { 1328 unsigned long folio_size = 1 << imu->folio_shift; 1329 unsigned long folio_mask = folio_size - 1; 1330 struct bio_vec *res_bvec = vec->bvec; 1331 size_t total_len = 0; 1332 unsigned bvec_idx = 0; 1333 unsigned iov_idx; 1334 1335 for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) { 1336 size_t iov_len = iovec[iov_idx].iov_len; 1337 u64 buf_addr = (u64)(uintptr_t)iovec[iov_idx].iov_base; 1338 struct bio_vec *src_bvec; 1339 size_t offset; 1340 int ret; 1341 1342 ret = validate_fixed_range(buf_addr, iov_len, imu); 1343 if (unlikely(ret)) 1344 return ret; 1345 1346 if (unlikely(!iov_len)) 1347 return -EFAULT; 1348 if (unlikely(check_add_overflow(total_len, iov_len, &total_len))) 1349 return -EOVERFLOW; 1350 1351 offset = buf_addr - imu->ubuf; 1352 /* 1353 * Only the first bvec can have non zero bv_offset, account it 1354 * here and work with full folios below. 1355 */ 1356 offset += imu->bvec[0].bv_offset; 1357 1358 src_bvec = imu->bvec + (offset >> imu->folio_shift); 1359 offset &= folio_mask; 1360 1361 for (; iov_len; offset = 0, bvec_idx++, src_bvec++) { 1362 size_t seg_size = min_t(size_t, iov_len, 1363 folio_size - offset); 1364 1365 bvec_set_page(&res_bvec[bvec_idx], 1366 src_bvec->bv_page, seg_size, offset); 1367 iov_len -= seg_size; 1368 } 1369 } 1370 if (total_len > MAX_RW_COUNT) 1371 return -EINVAL; 1372 1373 iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len); 1374 return 0; 1375 } 1376 1377 static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs, 1378 struct io_mapped_ubuf *imu) 1379 { 1380 unsigned shift = imu->folio_shift; 1381 size_t max_segs = 0; 1382 unsigned i; 1383 1384 for (i = 0; i < nr_iovs; i++) { 1385 max_segs += (iov[i].iov_len >> shift) + 2; 1386 if (max_segs > INT_MAX) 1387 return -EOVERFLOW; 1388 } 1389 return max_segs; 1390 } 1391 1392 static int io_vec_fill_kern_bvec(int ddir, struct iov_iter *iter, 1393 struct io_mapped_ubuf *imu, 1394 struct iovec *iovec, unsigned nr_iovs, 1395 struct iou_vec *vec) 1396 { 1397 const struct bio_vec *src_bvec = imu->bvec; 1398 struct bio_vec *res_bvec = vec->bvec; 1399 unsigned res_idx = 0; 1400 size_t total_len = 0; 1401 unsigned iov_idx; 1402 1403 for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) { 1404 size_t offset = (size_t)(uintptr_t)iovec[iov_idx].iov_base; 1405 size_t iov_len = iovec[iov_idx].iov_len; 1406 struct bvec_iter bi = { 1407 .bi_size = offset + iov_len, 1408 }; 1409 struct bio_vec bv; 1410 1411 bvec_iter_advance(src_bvec, &bi, offset); 1412 for_each_mp_bvec(bv, src_bvec, bi, bi) 1413 res_bvec[res_idx++] = bv; 1414 total_len += iov_len; 1415 } 1416 iov_iter_bvec(iter, ddir, res_bvec, res_idx, total_len); 1417 return 0; 1418 } 1419 1420 static int iov_kern_bvec_size(const struct iovec *iov, 1421 const struct io_mapped_ubuf *imu, 1422 unsigned int *nr_seg) 1423 { 1424 size_t offset = (size_t)(uintptr_t)iov->iov_base; 1425 const struct bio_vec *bvec = imu->bvec; 1426 int start = 0, i = 0; 1427 size_t off = 0; 1428 int ret; 1429 1430 ret = validate_fixed_range(offset, iov->iov_len, imu); 1431 if (unlikely(ret)) 1432 return ret; 1433 1434 for (i = 0; off < offset + iov->iov_len && i < imu->nr_bvecs; 1435 off += bvec[i].bv_len, i++) { 1436 if (offset >= off && offset < off + bvec[i].bv_len) 1437 start = i; 1438 } 1439 *nr_seg = i - start; 1440 return 0; 1441 } 1442 1443 static int io_kern_bvec_size(struct iovec *iov, unsigned nr_iovs, 1444 struct io_mapped_ubuf *imu, unsigned *nr_segs) 1445 { 1446 unsigned max_segs = 0; 1447 size_t total_len = 0; 1448 unsigned i; 1449 int ret; 1450 1451 *nr_segs = 0; 1452 for (i = 0; i < nr_iovs; i++) { 1453 if (unlikely(!iov[i].iov_len)) 1454 return -EFAULT; 1455 if (unlikely(check_add_overflow(total_len, iov[i].iov_len, 1456 &total_len))) 1457 return -EOVERFLOW; 1458 ret = iov_kern_bvec_size(&iov[i], imu, &max_segs); 1459 if (unlikely(ret)) 1460 return ret; 1461 *nr_segs += max_segs; 1462 } 1463 if (total_len > MAX_RW_COUNT) 1464 return -EINVAL; 1465 return 0; 1466 } 1467 1468 int io_import_reg_vec(int ddir, struct iov_iter *iter, 1469 struct io_kiocb *req, struct iou_vec *vec, 1470 unsigned nr_iovs, unsigned issue_flags) 1471 { 1472 struct io_rsrc_node *node; 1473 struct io_mapped_ubuf *imu; 1474 unsigned iovec_off; 1475 struct iovec *iov; 1476 unsigned nr_segs; 1477 1478 node = io_find_buf_node(req, issue_flags); 1479 if (!node) 1480 return -EFAULT; 1481 imu = node->buf; 1482 if (!(imu->dir & (1 << ddir))) 1483 return -EFAULT; 1484 1485 iovec_off = vec->nr - nr_iovs; 1486 iov = vec->iovec + iovec_off; 1487 1488 if (imu->flags & IO_REGBUF_F_KBUF) { 1489 int ret = io_kern_bvec_size(iov, nr_iovs, imu, &nr_segs); 1490 1491 if (unlikely(ret)) 1492 return ret; 1493 } else { 1494 int ret = io_estimate_bvec_size(iov, nr_iovs, imu); 1495 1496 if (ret < 0) 1497 return ret; 1498 nr_segs = ret; 1499 } 1500 1501 if (sizeof(struct bio_vec) > sizeof(struct iovec)) { 1502 size_t bvec_bytes; 1503 1504 bvec_bytes = nr_segs * sizeof(struct bio_vec); 1505 nr_segs = (bvec_bytes + sizeof(*iov) - 1) / sizeof(*iov); 1506 nr_segs += nr_iovs; 1507 } 1508 1509 if (nr_segs > vec->nr) { 1510 struct iou_vec tmp_vec = {}; 1511 int ret; 1512 1513 ret = io_vec_realloc(&tmp_vec, nr_segs); 1514 if (ret) 1515 return ret; 1516 1517 iovec_off = tmp_vec.nr - nr_iovs; 1518 memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs); 1519 io_vec_free(vec); 1520 1521 *vec = tmp_vec; 1522 iov = vec->iovec + iovec_off; 1523 req->flags |= REQ_F_NEED_CLEANUP; 1524 } 1525 1526 if (imu->flags & IO_REGBUF_F_KBUF) 1527 return io_vec_fill_kern_bvec(ddir, iter, imu, iov, nr_iovs, vec); 1528 1529 return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec); 1530 } 1531 1532 int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv, 1533 const struct iovec __user *uvec, size_t uvec_segs) 1534 { 1535 struct iovec *iov; 1536 int iovec_off, ret; 1537 void *res; 1538 1539 if (uvec_segs > iv->nr) { 1540 ret = io_vec_realloc(iv, uvec_segs); 1541 if (ret) 1542 return ret; 1543 req->flags |= REQ_F_NEED_CLEANUP; 1544 } 1545 1546 /* pad iovec to the right */ 1547 iovec_off = iv->nr - uvec_segs; 1548 iov = iv->iovec + iovec_off; 1549 res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov, 1550 io_is_compat(req->ctx)); 1551 if (IS_ERR(res)) 1552 return PTR_ERR(res); 1553 1554 req->flags |= REQ_F_IMPORT_BUFFER; 1555 return 0; 1556 } 1557