// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring/cmd.h>

#include <uapi/linux/io_uring.h>

#include "filetable.h"
#include "io_uring.h"
#include "openclose.h"
#include "rsrc.h"
#include "memmap.h"
#include "register.h"

/*
 * Per-request command state for a files/buffers update request, filled in
 * from the SQE by io_files_update_prep().
 */
struct io_rsrc_update {
	struct file			*file;
	u64				arg;		/* user pointer to fd/iovec array */
	u32				nr_args;	/* number of entries to update */
	u32				offset;		/* starting index in the table */
};

static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
			struct iovec *iov, struct page **last_hpage);

/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)

/* imu's with up to this many bvecs come from ctx->imu_cache, not kvmalloc */
#define IO_CACHED_BVECS_SEGS	32

/*
 * Charge @nr_pages against @user's RLIMIT_MEMLOCK allowance using a
 * lock-free cmpxchg loop. Returns 0 on success, -ENOMEM if the limit
 * would be exceeded.
 */
int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
	return 0;
}

/* Undo io_account_mem(): drop the user locked_vm and mm pinned_vm charges. */
void io_unaccount_mem(struct user_struct *user, struct mm_struct *mm_account,
		      unsigned long nr_pages)
{
	if (user)
		__io_unaccount_mem(user, nr_pages);

	if (mm_account)
		atomic64_sub(nr_pages, &mm_account->pinned_vm);
}

/*
 * Account @nr_pages of pinned memory against the user's RLIMIT_MEMLOCK
 * and the mm's pinned_vm counter. On rlimit failure nothing is charged.
 */
int io_account_mem(struct user_struct *user, struct mm_struct *mm_account,
		   unsigned long nr_pages)
{
	int ret;

	if (user) {
		ret = __io_account_mem(user, nr_pages);
		if (ret)
			return ret;
	}

	if (mm_account)
		atomic64_add(nr_pages, &mm_account->pinned_vm);

	return 0;
}

/*
 * Validate a user buffer range: non-zero, capped at 1G, and the
 * page-aligned end must not wrap the address space.
 */
int io_validate_user_buf_range(u64 uaddr, u64 ulen)
{
	unsigned long tmp, base = (unsigned long)uaddr;
	unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen);

	/* arbitrary limit, but we need something */
	if (ulen > SZ_1G || !ulen)
		return -EFAULT;
	if (check_add_overflow(base, acct_len, &tmp))
		return -EOVERFLOW;
	return 0;
}

/* Release callback for user-registered buffers: unpin each folio once. */
static void io_release_ubuf(void *priv)
{
	struct io_mapped_ubuf *imu = priv;
	unsigned int i;

	for (i = 0; i < imu->nr_bvecs; i++) {
		struct folio *folio = page_folio(imu->bvec[i].bv_page);

		unpin_user_folio(folio, 1);
	}
}

/*
 * Allocate an imu with room for @nr_bvecs bvecs, preferring the per-ctx
 * cache for small mappings (<= IO_CACHED_BVECS_SEGS segments).
 */
static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
					   int nr_bvecs)
{
	if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
		return io_cache_alloc(&ctx->imu_cache, GFP_KERNEL);
	return kvmalloc_flex(struct io_mapped_ubuf, bvec, nr_bvecs, GFP_KERNEL);
}

/* Free an imu back to the cache or kvfree(), mirroring io_alloc_imu(). */
static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
	if (imu->nr_bvecs <= IO_CACHED_BVECS_SEGS)
		io_cache_free(&ctx->imu_cache, imu);
	else
		kvfree(imu);
}

/*
 * Drop a reference to @imu and, on the final put, unaccount its pages,
 * invoke its release callback and free it.
 */
static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
	if (unlikely(refcount_read(&imu->refs) > 1)) {
		if (!refcount_dec_and_test(&imu->refs))
			return;
	}

	if (imu->acct_pages)
		io_unaccount_mem(ctx->user, ctx->mm_account, imu->acct_pages);
	imu->release(imu->priv);
	io_free_imu(ctx, imu);
}

/* Allocate a resource node of @type from the node cache, ref-initialized to 1. */
struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
{
	struct io_rsrc_node *node;

	node = io_cache_alloc(&ctx->node_cache, GFP_KERNEL);
	if (node) {
		node->type = type;
		node->refs = 1;
		node->tag = 0;
		node->file_ptr = 0;
	}
	return node;
}

/* Initialize the per-ctx node and imu caches; returns the failure status. */
bool io_rsrc_cache_init(struct io_ring_ctx *ctx)
{
	const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec,
						 IO_CACHED_BVECS_SEGS);
	const int node_size = sizeof(struct io_rsrc_node);
	bool ret;

	ret = io_alloc_cache_init(&ctx->node_cache, IO_ALLOC_CACHE_MAX,
				  node_size, 0);
	ret |= io_alloc_cache_init(&ctx->imu_cache, IO_ALLOC_CACHE_MAX,
				   imu_cache_size, 0);
	return ret;
}

/* Tear down the per-ctx node and imu caches. */
void io_rsrc_cache_free(struct io_ring_ctx *ctx)
{
	io_alloc_cache_free(&ctx->node_cache, kfree);
	io_alloc_cache_free(&ctx->imu_cache, kfree);
}

/*
 * Zero the CQE tags on all nodes in @data so that freeing them does not
 * post tag completions (used on registration failure unwind).
 */
static void io_clear_table_tags(struct io_rsrc_data *data)
{
	int i;

	for (i = 0; i < data->nr; i++) {
		struct io_rsrc_node *node = data->nodes[i];

		if (node)
			node->tag = 0;
	}
}

/* Put every node in @data and free the node array itself. */
__cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
			      struct io_rsrc_data *data)
{
	if (!data->nr)
		return;
	while (data->nr--) {
		if (data->nodes[data->nr])
			io_put_rsrc_node(ctx, data->nodes[data->nr]);
	}
	kvfree(data->nodes);
	data->nodes = NULL;
	data->nr = 0;
}

/* Allocate a zeroed node array of @nr entries for @data. */
__cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
{
	data->nodes = kvmalloc_objs(struct io_rsrc_node *, nr,
				    GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (data->nodes) {
		data->nr = nr;
		return 0;
	}
	return -ENOMEM;
}

/*
 * Update a range of registered file slots from userspace-provided fds
 * (and optional tags). fd == -1 clears a slot, IORING_REGISTER_FILES_SKIP
 * leaves it untouched. Returns the number of slots processed, or an
 * error if nothing was processed.
 */
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	int fd, i, err = 0;
	unsigned int done;

	if (!ctx->file_table.data.nr)
		return -ENXIO;
	if (up->offset + nr_args > ctx->file_table.data.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		/* a tag makes no sense for a slot being skipped or cleared */
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = up->offset + done;
		if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i))
			io_file_bitmap_clear(&ctx->file_table, i);

		if (fd != -1) {
			struct file *file = fget(fd);
			struct io_rsrc_node *node;

			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
			if (!node) {
				err = -ENOMEM;
				fput(file);
				break;
			}
			ctx->file_table.data.nodes[i] = node;
			if (tag)
				node->tag = tag;
			io_fixed_file_set(node, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}
	return done ? done : err;
}

/*
 * Update a range of registered buffers from a userspace iovec array
 * (and optional tags). A NULL iov_base removes a buffer. Returns the
 * number of entries processed, or an error if nothing was processed.
 */
static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec fast_iov, *iov;
	struct page *last_hpage = NULL;
	struct iovec __user *uvec;
	u64 user_data = up->data;
	__u32 done;
	int i, err;

	if (!ctx->buf_table.nr)
		return -ENXIO;
	if (up->offset + nr_args > ctx->buf_table.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		uvec = u64_to_user_ptr(user_data);
		iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
		if (IS_ERR(iov)) {
			err = PTR_ERR(iov);
			break;
		}
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			err = PTR_ERR(node);
			break;
		}
		if (tag) {
			/* NULL node means "slot cleared" - can't carry a tag */
			if (!node) {
				err = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
		io_reset_rsrc_node(ctx, &ctx->buf_table, i);
		ctx->buf_table.nodes[i] = node;
		/* compat tasks pass compat_iovec entries, stride differs */
		if (ctx->compat)
			user_data += sizeof(struct compat_iovec);
		else
			user_data += sizeof(struct iovec);
	}
	return done ? done : err;
}

/* Common entry for file/buffer table updates; validates the range first. */
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args)
{
	__u32 tmp;

	lockdep_assert_held(&ctx->uring_lock);

	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}

/*
 * IORING_REGISTER_FILES_UPDATE: legacy entry taking the smaller
 * io_uring_rsrc_update struct; the rest of up is zeroed.
 */
int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

/* IORING_REGISTER_{FILES,BUFFERS}_UPDATE2 entry with full update2 struct. */
int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

/*
 * IORING_REGISTER_{FILES,BUFFERS}2 entry: register a file or buffer
 * table, optionally sparse (all slots empty, data must then be 0).
 */
__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}

/* Prep for IORING_OP_FILES_UPDATE: pull offset/len/addr from the SQE. */
int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	return 0;
}

/*
 * IORING_FILE_INDEX_ALLOC variant of files update: install each fd into
 * an allocated slot and write the chosen slot index back to userspace.
 * Returns the number installed, or an error if none were.
 */
static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_table.data.nr)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (get_user(fd, &fds[done])) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (put_user(ret, &fds[done])) {
			/* can't report the slot back - undo the install */
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}

/* Issue handler for IORING_OP_FILES_UPDATE. */
int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;
	int ret;

	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
						&up2, up->nr_args);
		io_ring_submit_unlock(ctx, issue_flags);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_COMPLETE;
}

/*
 * Final teardown of a resource node: post the tag CQE if one was set,
 * release the underlying file/buffer, and recycle the node.
 */
void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
	if (node->tag)
		io_post_aux_cqe(ctx, node->tag, 0, 0);

	switch (node->type) {
	case IORING_RSRC_FILE:
		fput(io_slot_file(node));
		break;
	case IORING_RSRC_BUFFER:
		io_buffer_unmap(ctx, node->buf);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}

	io_cache_free(&ctx->node_cache, node);
}

/* Unregister the whole fixed file table. */
int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	if (!ctx->file_table.data.nr)
		return -ENXIO;

	io_free_file_tables(ctx, &ctx->file_table);
	io_file_table_set_alloc_range(ctx, 0, 0);
	return 0;
}

/*
 * Register a fixed file table from an array of fds (fd == -1 or a NULL
 * array leaves sparse slots). On any failure the whole table is torn
 * down again, with tags cleared so no CQEs are posted for the unwind.
 */
int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_table.data.nr)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args))
		return -ENOMEM;

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		ret = -EFAULT;
		if (tags && copy_from_user(&tag, &tags[i], sizeof(tag)))
			goto fail;
		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd)))
			goto fail;
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (tag)
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		ret = -ENOMEM;
		node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
		if (!node) {
			fput(file);
			goto fail;
		}
		if (tag)
			node->tag = tag;
		ctx->file_table.data.nodes[i] = node;
		io_fixed_file_set(node, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
	return 0;
fail:
	io_clear_table_tags(&ctx->file_table.data);
	io_sqe_files_unregister(ctx);
	return ret;
}

/* Unregister the whole fixed buffer table. */
int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	if (!ctx->buf_table.nr)
		return -ENXIO;
	io_rsrc_data_free(ctx, &ctx->buf_table);
	return 0;
}

/*
 * Not super efficient, but this is just a registration time. And we do cache
 * the last compound head, so generally we'll only do a full search if we don't
 * match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->buf_table.nr; i++) {
		struct io_rsrc_node *node = ctx->buf_table.nodes[i];
		struct io_mapped_ubuf *imu;

		if (!node)
			continue;
		imu = node->buf;
		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}

/*
 * Compute and charge the locked-memory accounting for @imu's pages.
 * Compound (huge) pages are charged once at full size, skipping heads
 * already accounted either in this array or by a prior registration.
 */
static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx->user, ctx->mm_account, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}

/*
 * Collapse a page array whose pages form whole folios into an array of
 * folio head pages, dropping the per-page pin refs down to one per folio.
 * Replaces *pages/*nr_pages on success; returns false if the new array
 * can't be allocated (original array left untouched).
 */
static bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
			       struct io_imu_folio_data *data)
{
	struct page **page_array = *pages, **new_array = NULL;
	unsigned nr_pages_left = *nr_pages;
	unsigned nr_folios = data->nr_folios;
	unsigned i, j;

	/* Store head pages only*/
	new_array = kvmalloc_objs(struct page *, nr_folios, GFP_KERNEL);
	if (!new_array)
		return false;

	for (i = 0, j = 0; i < nr_folios; i++) {
		struct page *p = compound_head(page_array[j]);
		struct folio *folio = page_folio(p);
		unsigned int nr;

		WARN_ON_ONCE(i > 0 && p != page_array[j]);

		nr = i ? data->nr_pages_mid : data->nr_pages_head;
		nr = min(nr, nr_pages_left);
		/* Drop all but one ref, the entire folio will remain pinned. */
		if (nr > 1)
			unpin_user_folio(folio, nr - 1);
		j += nr;
		nr_pages_left -= nr;
		new_array[i] = p;
	}

	WARN_ON_ONCE(j != *nr_pages);

	kvfree(page_array);
	*pages = new_array;
	*nr_pages = nr_folios;
	return true;
}

/*
 * Check whether @page_array can be coalesced into folio-granularity
 * bvecs: pages must be contiguous within each folio, and every folio
 * except the first and last must be fully covered and equally sized.
 * Fills *data and returns true if coalescing is possible.
 */
bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
			      struct io_imu_folio_data *data)
{
	struct folio *folio = page_folio(page_array[0]);
	unsigned int count = 1, nr_folios = 1;
	int i;

	data->nr_pages_mid = folio_nr_pages(folio);
	data->folio_shift = folio_shift(folio);
	data->first_folio_page_idx = folio_page_idx(folio, page_array[0]);

	/*
	 * Check if pages are contiguous inside a folio, and all folios have
	 * the same page count except for the head and tail.
	 */
	for (i = 1; i < nr_pages; i++) {
		if (page_folio(page_array[i]) == folio &&
		    page_array[i] == page_array[i-1] + 1) {
			count++;
			continue;
		}

		if (nr_folios == 1) {
			if (folio_page_idx(folio, page_array[i-1]) !=
			    data->nr_pages_mid - 1)
				return false;

			data->nr_pages_head = count;
		} else if (count != data->nr_pages_mid) {
			return false;
		}

		folio = page_folio(page_array[i]);
		if (folio_size(folio) != (1UL << data->folio_shift) ||
		    folio_page_idx(folio, page_array[i]) != 0)
			return false;

		count = 1;
		nr_folios++;
	}
	if (nr_folios == 1)
		data->nr_pages_head = count;

	data->nr_folios = nr_folios;
	return true;
}

/*
 * Pin a single user buffer described by @iov and build a resource node
 * wrapping an imu with one bvec per page (or per folio if coalesced).
 * A NULL iov_base with zero length returns NULL, meaning "clear slot".
 * On error all pins, accounting and allocations are unwound.
 */
static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
						   struct iovec *iov,
						   struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	struct io_rsrc_node *node;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;
	struct io_imu_folio_data data;
	bool coalesced = false;

	if (!iov->iov_base) {
		if (iov->iov_len)
			return ERR_PTR(-EFAULT);
		/* remove the buffer without installing a new one */
		return NULL;
	}

	ret = io_validate_user_buf_range((unsigned long)iov->iov_base,
					 iov->iov_len);
	if (ret)
		return ERR_PTR(ret);

	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
	if (!node)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
			     &nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	/* If it's huge page(s), try to coalesce them into fewer bvec entries */
	if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) {
		if (data.nr_pages_mid != 1)
			coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
	}

	imu = io_alloc_imu(ctx, nr_pages);
	if (!imu)
		goto done;

	imu->nr_bvecs = nr_pages;
	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret)
		goto done;

	size = iov->iov_len;
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->len = iov->iov_len;
	imu->folio_shift = PAGE_SHIFT;
	imu->release = io_release_ubuf;
	imu->priv = imu;
	imu->flags = 0;
	imu->dir = IO_IMU_DEST | IO_IMU_SOURCE;
	if (coalesced)
		imu->folio_shift = data.folio_shift;
	refcount_set(&imu->refs, 1);

	off = (unsigned long)iov->iov_base & ~PAGE_MASK;
	if (coalesced)
		off += data.first_folio_page_idx << PAGE_SHIFT;

	node->buf = imu;
	ret = 0;

	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
		off = 0;
		size -= vec_len;
	}
done:
	if (ret) {
		if (imu)
			io_free_imu(ctx, imu);
		if (pages) {
			for (i = 0; i < nr_pages; i++)
				unpin_user_folio(page_folio(pages[i]), 1);
		}
		io_cache_free(&ctx->node_cache, node);
		node = ERR_PTR(ret);
	}
	kvfree(pages);
	return node;
}

/*
 * Register a fixed buffer table from a userspace iovec array (a NULL
 * @arg registers an all-sparse table). The partially built table is
 * installed even on error so the unwind path can tear it down, with
 * tags cleared to suppress CQEs.
 */
int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data data;
	struct iovec fast_iov, *iov = &fast_iov;
	const struct iovec __user *uvec;
	int i, ret;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->buf_table.nr)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_data_alloc(&data, nr_args);
	if (ret)
		return ret;

	if (!arg)
		memset(iov, 0, sizeof(*iov));

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		if (arg) {
			uvec = (struct iovec __user *) arg;
			iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
			if (IS_ERR(iov)) {
				ret = PTR_ERR(iov);
				break;
			}
			if (ctx->compat)
				arg += sizeof(struct compat_iovec);
			else
				arg += sizeof(struct iovec);
		}

		if (tags) {
			if (copy_from_user(&tag, &tags[i], sizeof(tag))) {
				ret = -EFAULT;
				break;
			}
		}

		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			ret = PTR_ERR(node);
			break;
		}
		if (tag) {
			if (!node) {
				ret = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		data.nodes[i] = node;
	}

	ctx->buf_table = data;
	if (ret) {
		io_clear_table_tags(&ctx->buf_table);
		io_sqe_buffers_unregister(ctx);
	}
	return ret;
}

/*
 * Register the bvecs of a kernel request @rq as a fixed buffer at slot
 * @index. @release is called when the buffer is dropped; the imu is
 * marked IO_REGBUF_F_KBUF and restricted to the request's data direction.
 */
int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
			    void (*release)(void *), unsigned int index,
			    unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *data = &ctx->buf_table;
	struct req_iterator rq_iter;
	struct io_mapped_ubuf *imu;
	struct io_rsrc_node *node;
	struct bio_vec bv;
	unsigned int nr_bvecs = 0;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	if (index >= data->nr) {
		ret = -EINVAL;
		goto unlock;
	}
	index = array_index_nospec(index, data->nr);

	if (data->nodes[index]) {
		ret = -EBUSY;
		goto unlock;
	}

	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
	if (!node) {
		ret = -ENOMEM;
		goto unlock;
	}

	/*
	 * blk_rq_nr_phys_segments() may overestimate the number of bvecs
	 * but avoids needing to iterate over the bvecs
	 */
	imu = io_alloc_imu(ctx, blk_rq_nr_phys_segments(rq));
	if (!imu) {
		/*
		 * NOTE(review): node came from ctx->node_cache via
		 * io_rsrc_node_alloc(); kfree() here bypasses the cache -
		 * confirm this is intended vs. io_cache_free().
		 */
		kfree(node);
		ret = -ENOMEM;
		goto unlock;
	}

	imu->ubuf = 0;
	imu->len = blk_rq_bytes(rq);
	imu->acct_pages = 0;
	imu->folio_shift = PAGE_SHIFT;
	refcount_set(&imu->refs, 1);
	imu->release = release;
	imu->priv = rq;
	imu->flags = IO_REGBUF_F_KBUF;
	imu->dir = 1 << rq_data_dir(rq);

	rq_for_each_bvec(bv, rq, rq_iter)
		imu->bvec[nr_bvecs++] = bv;
	imu->nr_bvecs = nr_bvecs;

	node->buf = imu;
	data->nodes[index] = node;
unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_register_bvec);

/*
 * Remove a kernel bvec buffer previously installed with
 * io_buffer_register_bvec(). Only IO_REGBUF_F_KBUF buffers may be
 * removed this way.
 */
int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
			      unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *data = &ctx->buf_table;
	struct io_rsrc_node *node;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	if (index >= data->nr) {
		ret = -EINVAL;
		goto unlock;
	}
	index = array_index_nospec(index, data->nr);

	node = data->nodes[index];
	if (!node) {
		ret = -EINVAL;
		goto unlock;
	}
	if (!(node->buf->flags & IO_REGBUF_F_KBUF)) {
		ret = -EBUSY;
		goto unlock;
	}

	io_put_rsrc_node(ctx, node);
	data->nodes[index] = NULL;
unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec);

/*
 * Validate that [buf_addr, buf_addr + len) lies within @imu's mapping
 * and does not exceed MAX_RW_COUNT or overflow.
 */
static int validate_fixed_range(u64 buf_addr, size_t len,
				const struct io_mapped_ubuf *imu)
{
	u64 buf_end;

	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
		return -EFAULT;
	if (unlikely(len > MAX_RW_COUNT))
		return -EFAULT;
	return 0;
}

/* Set up an iterator over a kernel (KBUF) imu; offset handled by advance. */
static int io_import_kbuf(int ddir, struct iov_iter *iter,
			  struct io_mapped_ubuf *imu, size_t len, size_t offset)
{
	size_t count = len + offset;

	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count);
	iov_iter_advance(iter, offset);
	return 0;
}

/*
 * Build a bvec iterator over a registered buffer for [buf_addr, +len),
 * after range and direction validation.
 */
static int io_import_fixed(int ddir, struct iov_iter *iter,
			   struct io_mapped_ubuf *imu,
			   u64 buf_addr, size_t len)
{
	const struct bio_vec *bvec;
	size_t folio_mask;
	unsigned nr_segs;
	size_t offset;
	int ret;

	ret = validate_fixed_range(buf_addr, len, imu);
	if (unlikely(ret))
		return ret;
	if (!(imu->dir & (1 << ddir)))
		return -EFAULT;

	offset = buf_addr - imu->ubuf;

	if (imu->flags & IO_REGBUF_F_KBUF)
		return io_import_kbuf(ddir, iter, imu, len, offset);

	/*
	 * Don't use iov_iter_advance() here, as it's really slow for
	 * using the latter parts of a big fixed buffer - it iterates
	 * over each segment manually. We can cheat a bit here for user
	 * registered nodes, because we know that:
	 *
	 * 1) it's a BVEC iter, we set it up
	 * 2) all bvecs are the same in size, except potentially the
	 *    first and last bvec
	 */
	folio_mask = (1UL << imu->folio_shift) - 1;
	bvec = imu->bvec;
	if (offset >= bvec->bv_len) {
		unsigned long seg_skip;

		/* skip first vec */
		offset -= bvec->bv_len;
		seg_skip = 1 + (offset >> imu->folio_shift);
		bvec += seg_skip;
		offset &= folio_mask;
	}
	nr_segs = (offset + len + bvec->bv_offset + folio_mask) >> imu->folio_shift;
	iov_iter_bvec(iter, ddir, bvec, nr_segs, len);
	iter->iov_offset = offset;
	return 0;
}

/*
 * Look up and pin the buffer node for req->buf_index, caching it on the
 * request (REQ_F_BUF_NODE) so repeated imports skip the table lookup.
 */
inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req,
					     unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_rsrc_node *node;

	if (req->flags & REQ_F_BUF_NODE)
		return req->buf_node;
	req->flags |= REQ_F_BUF_NODE;

	io_ring_submit_lock(ctx, issue_flags);
	node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
	if (node) {
		node->refs++;
		req->buf_node = node;
		io_ring_submit_unlock(ctx, issue_flags);
		return node;
	}
	req->flags &= ~REQ_F_BUF_NODE;
	io_ring_submit_unlock(ctx, issue_flags);
	return NULL;
}

/* Import a registered buffer range for a request into @iter. */
int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,
		      u64 buf_addr, size_t len, int ddir,
		      unsigned issue_flags)
{
	struct io_rsrc_node *node;

	node = io_find_buf_node(req, issue_flags);
	if (!node)
		return -EFAULT;
	return io_import_fixed(ddir, iter, node->buf, buf_addr, len);
}

/* Lock two rings at once. The rings must be different! */
static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
{
	/* lock in a stable address order to avoid ABBA deadlock */
	if (ctx1 > ctx2)
		swap(ctx1, ctx2);
	mutex_lock(&ctx1->uring_lock);
	mutex_lock_nested(&ctx2->uring_lock, SINGLE_DEPTH_NESTING);
}

/* Both rings are locked by the caller. */
static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
			    struct io_uring_clone_buffers *arg)
{
	struct io_rsrc_data data;
	int i, ret, off, nr;
	unsigned int nbufs;

	lockdep_assert_held(&ctx->uring_lock);
	lockdep_assert_held(&src_ctx->uring_lock);

	/*
	 * Accounting state is shared between the two rings; that only works if
	 * both rings are accounted towards the same counters.
	 */
	if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account)
		return -EINVAL;

	/* if offsets are given, must have nr specified too */
	if (!arg->nr && (arg->dst_off || arg->src_off))
		return -EINVAL;
	/* not allowed unless REPLACE is set */
	if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
		return -EBUSY;

	nbufs = src_ctx->buf_table.nr;
	if (!nbufs)
		return -ENXIO;
	if (!arg->nr)
		arg->nr = nbufs;
	else if (arg->nr > nbufs)
		return -EINVAL;
	else if (arg->nr > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	if (check_add_overflow(arg->nr, arg->src_off, &off) || off > nbufs)
		return -EOVERFLOW;
	/* nbufs now becomes the end index in the destination table */
	if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
		return -EOVERFLOW;
	if (nbufs > IORING_MAX_REG_BUFFERS)
		return -EINVAL;

	ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr));
	if (ret)
		return ret;

	/* Copy original dst nodes from before the cloned range */
	for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
		struct io_rsrc_node *node = ctx->buf_table.nodes[i];

		if (node) {
			data.nodes[i] = node;
			node->refs++;
		}
	}

	off = arg->dst_off;
	i = arg->src_off;
	nr = arg->nr;
	while (nr--) {
		struct io_rsrc_node *dst_node, *src_node;

		src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i);
		if (!src_node) {
			dst_node = NULL;
		} else {
			dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
			if (!dst_node) {
				io_rsrc_data_free(ctx, &data);
				return -ENOMEM;
			}

			/* the imu itself is shared, not duplicated */
			refcount_inc(&src_node->buf->refs);
			dst_node->buf = src_node->buf;
		}
		data.nodes[off++] = dst_node;
		i++;
	}

	/* Copy original dst nodes from after the cloned range */
	for (i = nbufs; i < ctx->buf_table.nr; i++) {
		struct io_rsrc_node *node = ctx->buf_table.nodes[i];

		if (node) {
			data.nodes[i] = node;
			node->refs++;
		}
	}

	/*
	 * If asked for replace, put the old table. data->nodes[] holds both
	 * old and new nodes at this point.
	 */
	if (arg->flags & IORING_REGISTER_DST_REPLACE)
		io_rsrc_data_free(ctx, &ctx->buf_table);

	/*
	 * ctx->buf_table must be empty now - either the contents are being
	 * replaced and we just freed the table, or the contents are being
	 * copied to a ring that does not have buffers yet (checked at function
	 * entry).
	 */
	WARN_ON_ONCE(ctx->buf_table.nr);
	ctx->buf_table = data;
	return 0;
}

/*
 * Copy the registered buffers from the source ring whose file descriptor
 * is given in the src_fd to the current ring. This is identical to registering
 * the buffers with ctx, except faster as mappings already exist.
 *
 * Since the memory is already accounted once, don't account it again.
 */
int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_clone_buffers buf;
	struct io_ring_ctx *src_ctx;
	bool registered_src;
	struct file *file;
	int ret;

	if (copy_from_user(&buf, arg, sizeof(buf)))
		return -EFAULT;
	if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE))
		return -EINVAL;
	if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr)
		return -EBUSY;
	if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
		return -EINVAL;

	registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
	file = io_uring_register_get_file(buf.src_fd, registered_src);
	if (IS_ERR(file))
		return PTR_ERR(file);

	src_ctx = file->private_data;
	if (src_ctx != ctx) {
		/* drop our lock and retake both in a deadlock-safe order */
		mutex_unlock(&ctx->uring_lock);
		lock_two_rings(ctx, src_ctx);

		if (src_ctx->submitter_task &&
		    src_ctx->submitter_task != current) {
			ret = -EEXIST;
			goto out;
		}
	}

	ret = io_clone_buffers(ctx, src_ctx, &buf);

out:
	if (src_ctx != ctx)
		mutex_unlock(&src_ctx->uring_lock);

	fput(file);
	return ret;
}

/* Free an iou_vec's iovec array (no-op if already empty). */
void io_vec_free(struct iou_vec *iv)
{
	if (!iv->iovec)
		return;
	kfree(iv->iovec);
	iv->iovec = NULL;
	iv->nr = 0;
}

/*
 * Replace @iv's backing array with a fresh allocation of @nr_entries.
 * Existing contents are NOT preserved; the old array is freed.
 */
int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries)
{
	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_NOWARN;
	struct iovec *iov;

	iov = kmalloc_objs(iov[0], nr_entries, gfp);
	if (!iov)
		return -ENOMEM;

	io_vec_free(iv);
	iv->iovec = iov;
	iv->nr = nr_entries;
	return 0;
}

/*
 * Translate user iovecs addressed within a registered (user) buffer into
 * bvecs over the buffer's pinned folios, and set up @iter over them.
 */
static int io_vec_fill_bvec(int ddir, struct iov_iter *iter,
				struct io_mapped_ubuf *imu,
				struct iovec *iovec, unsigned nr_iovs,
				struct iou_vec *vec)
{
	unsigned long folio_size = 1 << imu->folio_shift;
	unsigned long folio_mask = folio_size - 1;
	struct bio_vec *res_bvec = vec->bvec;
	size_t total_len = 0;
	unsigned bvec_idx = 0;
	unsigned iov_idx;

	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
		size_t iov_len = iovec[iov_idx].iov_len;
		u64 buf_addr = (u64)(uintptr_t)iovec[iov_idx].iov_base;
		struct bio_vec *src_bvec;
		size_t offset;
		int ret;

		ret = validate_fixed_range(buf_addr, iov_len, imu);
		if (unlikely(ret))
			return ret;

		if (unlikely(!iov_len))
			return -EFAULT;
		if (unlikely(check_add_overflow(total_len, iov_len, &total_len)))
			return -EOVERFLOW;

		offset = buf_addr - imu->ubuf;
		/*
		 * Only the first bvec can have non zero bv_offset, account it
		 * here and work with full folios below.
		 */
		offset += imu->bvec[0].bv_offset;

		src_bvec = imu->bvec + (offset >> imu->folio_shift);
		offset &= folio_mask;

		for (; iov_len; offset = 0, bvec_idx++, src_bvec++) {
			size_t seg_size = min_t(size_t, iov_len,
						folio_size - offset);

			bvec_set_page(&res_bvec[bvec_idx],
				      src_bvec->bv_page, seg_size, offset);
			iov_len -= seg_size;
		}
	}
	if (total_len > MAX_RW_COUNT)
		return -EINVAL;

	iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len);
	return 0;
}

/*
 * Upper-bound the bvec count needed for @nr_iovs iovecs over @imu
 * (+2 per iovec for partial head/tail folios). -EOVERFLOW if it
 * can't fit in an int.
 */
static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs,
				 struct io_mapped_ubuf *imu)
{
	unsigned shift = imu->folio_shift;
	size_t max_segs = 0;
	unsigned i;

	for (i = 0; i < nr_iovs; i++) {
		max_segs += (iov[i].iov_len >> shift) + 2;
		if (max_segs > INT_MAX)
			return -EOVERFLOW;
	}
	return max_segs;
}

/*
 * Translate iovecs addressed within a kernel (KBUF) registered buffer,
 * where iov_base is an offset into the buffer, into bvecs for @iter.
 */
static int io_vec_fill_kern_bvec(int ddir, struct iov_iter *iter,
				 struct io_mapped_ubuf *imu,
				 struct iovec *iovec, unsigned nr_iovs,
				 struct iou_vec *vec)
{
	const struct bio_vec *src_bvec = imu->bvec;
	struct bio_vec *res_bvec = vec->bvec;
	unsigned res_idx = 0;
	size_t total_len = 0;
	unsigned iov_idx;

	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
		size_t offset = (size_t)(uintptr_t)iovec[iov_idx].iov_base;
		size_t iov_len = iovec[iov_idx].iov_len;
		struct bvec_iter bi = {
			.bi_size	= offset + iov_len,
		};
		struct bio_vec bv;

		bvec_iter_advance(src_bvec, &bi, offset);
		for_each_mp_bvec(bv, src_bvec, bi, bi)
			res_bvec[res_idx++] = bv;
		total_len += iov_len;
	}
	iov_iter_bvec(iter, ddir, res_bvec, res_idx, total_len);
	return 0;
}

static int iov_kern_bvec_size(const struct iovec *iov,
			      const struct io_mapped_ubuf *imu,
			      unsigned int *nr_seg)
{
	size_t offset = (size_t)(uintptr_t)iov->iov_base;
	const
struct bio_vec *bvec = imu->bvec; 1425 int start = 0, i = 0; 1426 size_t off = 0; 1427 int ret; 1428 1429 ret = validate_fixed_range(offset, iov->iov_len, imu); 1430 if (unlikely(ret)) 1431 return ret; 1432 1433 for (i = 0; off < offset + iov->iov_len && i < imu->nr_bvecs; 1434 off += bvec[i].bv_len, i++) { 1435 if (offset >= off && offset < off + bvec[i].bv_len) 1436 start = i; 1437 } 1438 *nr_seg = i - start; 1439 return 0; 1440 } 1441 1442 static int io_kern_bvec_size(struct iovec *iov, unsigned nr_iovs, 1443 struct io_mapped_ubuf *imu, unsigned *nr_segs) 1444 { 1445 unsigned max_segs = 0; 1446 size_t total_len = 0; 1447 unsigned i; 1448 int ret; 1449 1450 *nr_segs = 0; 1451 for (i = 0; i < nr_iovs; i++) { 1452 if (unlikely(!iov[i].iov_len)) 1453 return -EFAULT; 1454 if (unlikely(check_add_overflow(total_len, iov[i].iov_len, 1455 &total_len))) 1456 return -EOVERFLOW; 1457 ret = iov_kern_bvec_size(&iov[i], imu, &max_segs); 1458 if (unlikely(ret)) 1459 return ret; 1460 *nr_segs += max_segs; 1461 } 1462 if (total_len > MAX_RW_COUNT) 1463 return -EINVAL; 1464 return 0; 1465 } 1466 1467 int io_import_reg_vec(int ddir, struct iov_iter *iter, 1468 struct io_kiocb *req, struct iou_vec *vec, 1469 unsigned nr_iovs, unsigned issue_flags) 1470 { 1471 struct io_rsrc_node *node; 1472 struct io_mapped_ubuf *imu; 1473 unsigned iovec_off; 1474 struct iovec *iov; 1475 unsigned nr_segs; 1476 1477 node = io_find_buf_node(req, issue_flags); 1478 if (!node) 1479 return -EFAULT; 1480 imu = node->buf; 1481 if (!(imu->dir & (1 << ddir))) 1482 return -EFAULT; 1483 1484 iovec_off = vec->nr - nr_iovs; 1485 iov = vec->iovec + iovec_off; 1486 1487 if (imu->flags & IO_REGBUF_F_KBUF) { 1488 int ret = io_kern_bvec_size(iov, nr_iovs, imu, &nr_segs); 1489 1490 if (unlikely(ret)) 1491 return ret; 1492 } else { 1493 int ret = io_estimate_bvec_size(iov, nr_iovs, imu); 1494 1495 if (ret < 0) 1496 return ret; 1497 nr_segs = ret; 1498 } 1499 1500 if (sizeof(struct bio_vec) > sizeof(struct iovec)) { 
1501 size_t bvec_bytes; 1502 1503 bvec_bytes = nr_segs * sizeof(struct bio_vec); 1504 nr_segs = (bvec_bytes + sizeof(*iov) - 1) / sizeof(*iov); 1505 nr_segs += nr_iovs; 1506 } 1507 1508 if (nr_segs > vec->nr) { 1509 struct iou_vec tmp_vec = {}; 1510 int ret; 1511 1512 ret = io_vec_realloc(&tmp_vec, nr_segs); 1513 if (ret) 1514 return ret; 1515 1516 iovec_off = tmp_vec.nr - nr_iovs; 1517 memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs); 1518 io_vec_free(vec); 1519 1520 *vec = tmp_vec; 1521 iov = vec->iovec + iovec_off; 1522 req->flags |= REQ_F_NEED_CLEANUP; 1523 } 1524 1525 if (imu->flags & IO_REGBUF_F_KBUF) 1526 return io_vec_fill_kern_bvec(ddir, iter, imu, iov, nr_iovs, vec); 1527 1528 return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec); 1529 } 1530 1531 int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv, 1532 const struct iovec __user *uvec, size_t uvec_segs) 1533 { 1534 struct iovec *iov; 1535 int iovec_off, ret; 1536 void *res; 1537 1538 if (uvec_segs > iv->nr) { 1539 ret = io_vec_realloc(iv, uvec_segs); 1540 if (ret) 1541 return ret; 1542 req->flags |= REQ_F_NEED_CLEANUP; 1543 } 1544 1545 /* pad iovec to the right */ 1546 iovec_off = iv->nr - uvec_segs; 1547 iov = iv->iovec + iovec_off; 1548 res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov, 1549 io_is_compat(req->ctx)); 1550 if (IS_ERR(res)) 1551 return PTR_ERR(res); 1552 1553 req->flags |= REQ_F_IMPORT_BUFFER; 1554 return 0; 1555 } 1556