// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring/cmd.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "openclose.h"
#include "rsrc.h"
#include "memmap.h"
#include "register.h"

struct io_rsrc_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
};

static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
			struct iovec *iov, struct page **last_hpage);

/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)

#define IO_CACHED_BVECS_SEGS	32

int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
	return 0;
}

void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	if (ctx->user)
		__io_unaccount_mem(ctx->user, nr_pages);

	if (ctx->mm_account)
		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}

int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	int ret;

	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
	}

	if (ctx->mm_account)
		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);

	return 0;
}

int io_validate_user_buf_range(u64 uaddr, u64 ulen)
{
	unsigned long tmp, base = (unsigned long)uaddr;
	unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen);

	/* arbitrary limit, but we need something */
	if (ulen > SZ_1G || !ulen)
		return -EFAULT;
	if (check_add_overflow(base, acct_len, &tmp))
		return -EOVERFLOW;
	return 0;
}

static int io_buffer_validate(struct iovec *iov)
{
	/*
	 * Don't impose further limits on the size and buffer constraints
	 * here; we'll -EINVAL at submission time if they are wrong.
	 */
	if (!iov->iov_base)
		return iov->iov_len ? -EFAULT : 0;

	return io_validate_user_buf_range((unsigned long)iov->iov_base,
					  iov->iov_len);
}

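/*
 * Release callback for buffers registered from user memory: drop the
 * remaining pin on each folio backing the mapping.
 */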
static void io_release_ubuf(void *priv)
{
	struct io_mapped_ubuf *imu = priv;
	unsigned int i;

	for (i = 0; i < imu->nr_bvecs; i++) {
		struct folio *folio = page_folio(imu->bvec[i].bv_page);

		unpin_user_folio(folio, 1);
	}
}

static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
					   int nr_bvecs)
{
	if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
		return io_cache_alloc(&ctx->imu_cache, GFP_KERNEL);
	return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs),
			GFP_KERNEL);
}

static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
	if (imu->nr_bvecs <= IO_CACHED_BVECS_SEGS)
		io_cache_free(&ctx->imu_cache, imu);
	else
		kvfree(imu);
}

static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
	if (unlikely(refcount_read(&imu->refs) > 1)) {
		if (!refcount_dec_and_test(&imu->refs))
			return;
	}

	if (imu->acct_pages)
		io_unaccount_mem(ctx, imu->acct_pages);
	imu->release(imu->priv);
	io_free_imu(ctx, imu);
}

struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
{
	struct io_rsrc_node *node;

	node = io_cache_alloc(&ctx->node_cache, GFP_KERNEL);
	if (node) {
		node->type = type;
		node->refs = 1;
		node->tag = 0;
		node->file_ptr = 0;
	}
	return node;
}

bool io_rsrc_cache_init(struct io_ring_ctx *ctx)
{
	const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec,
						 IO_CACHED_BVECS_SEGS);
	const int node_size = sizeof(struct io_rsrc_node);
	bool ret;

	ret = io_alloc_cache_init(&ctx->node_cache, IO_ALLOC_CACHE_MAX,
				  node_size, 0);
	ret |= io_alloc_cache_init(&ctx->imu_cache, IO_ALLOC_CACHE_MAX,
				   imu_cache_size, 0);
	return ret;
}

void io_rsrc_cache_free(struct io_ring_ctx *ctx)
{
	io_alloc_cache_free(&ctx->node_cache, kfree);
	io_alloc_cache_free(&ctx->imu_cache, kfree);
}

static void io_clear_table_tags(struct io_rsrc_data *data)
{
	int i;

	for (i = 0; i < data->nr; i++) {
		struct io_rsrc_node *node = data->nodes[i];

		if (node)
			node->tag = 0;
	}
}

__cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
			      struct io_rsrc_data *data)
{
	if (!data->nr)
		return;
	while (data->nr--) {
		if (data->nodes[data->nr])
			io_put_rsrc_node(ctx, data->nodes[data->nr]);
	}
	kvfree(data->nodes);
	data->nodes = NULL;
	data->nr = 0;
}

__cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
{
	data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *),
				     GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (data->nodes) {
		data->nr = nr;
		return 0;
	}
	return -ENOMEM;
}

static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	int fd, i, err = 0;
	unsigned int done;

	if (!ctx->file_table.data.nr)
		return -ENXIO;
	if (up->offset + nr_args > ctx->file_table.data.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = up->offset + done;
		if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i))
			io_file_bitmap_clear(&ctx->file_table, i);

		if (fd != -1) {
			struct file *file = fget(fd);
			struct io_rsrc_node *node;

			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
			if (!node) {
				err = -ENOMEM;
				fput(file);
				break;
			}
			ctx->file_table.data.nodes[i] = node;
			if (tag)
				node->tag = tag;
			io_fixed_file_set(node, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}
	return done ? done : err;
}

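/*
 * Update registered buffers in place: each entry pulls one iovec (and an
 * optional tag) from userspace, registers the new buffer and drops any node
 * that previously occupied the slot.
 */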
static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec fast_iov, *iov;
	struct page *last_hpage = NULL;
	struct iovec __user *uvec;
	u64 user_data = up->data;
	__u32 done;
	int i, err;

	if (!ctx->buf_table.nr)
		return -ENXIO;
	if (up->offset + nr_args > ctx->buf_table.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		uvec = u64_to_user_ptr(user_data);
		iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
		if (IS_ERR(iov)) {
			err = PTR_ERR(iov);
			break;
		}
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(iov);
		if (err)
			break;
		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			err = PTR_ERR(node);
			break;
		}
		if (tag) {
			if (!node) {
				err = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
		io_reset_rsrc_node(ctx, &ctx->buf_table, i);
		ctx->buf_table.nodes[i] = node;
		if (ctx->compat)
			user_data += sizeof(struct compat_iovec);
		else
			user_data += sizeof(struct iovec);
	}
	return done ? done : err;
}

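/*
 * Common entry point for resource updates: check that the offset range
 * doesn't overflow and dispatch to the per-type helper.
 */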
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args)
{
	__u32 tmp;

	lockdep_assert_held(&ctx->uring_lock);

	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}

int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	return 0;
}

static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_table.data.nr)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}

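/*
 * IORING_OP_FILES_UPDATE: if the offset is IORING_FILE_INDEX_ALLOC, slots are
 * allocated by the kernel and the chosen indices are copied back into the
 * user-supplied fd array; otherwise this behaves like a regular registered
 * file update starting at the given offset.
 */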
int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;
	int ret;

	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
						&up2, up->nr_args);
		io_ring_submit_unlock(ctx, issue_flags);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_COMPLETE;
}

void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
	if (node->tag)
		io_post_aux_cqe(ctx, node->tag, 0, 0);

	switch (node->type) {
	case IORING_RSRC_FILE:
		fput(io_slot_file(node));
		break;
	case IORING_RSRC_BUFFER:
		io_buffer_unmap(ctx, node->buf);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}

	io_cache_free(&ctx->node_cache, node);
}

int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	if (!ctx->file_table.data.nr)
		return -ENXIO;

	io_free_file_tables(ctx, &ctx->file_table);
	io_file_table_set_alloc_range(ctx, 0, 0);
	return 0;
}

int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_table.data.nr)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args))
		return -ENOMEM;

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		ret = -EFAULT;
		if (tags && copy_from_user(&tag, &tags[i], sizeof(tag)))
			goto fail;
		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd)))
			goto fail;
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (tag)
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		ret = -ENOMEM;
		node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
		if (!node) {
			fput(file);
			goto fail;
		}
		if (tag)
			node->tag = tag;
		ctx->file_table.data.nodes[i] = node;
		io_fixed_file_set(node, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
	return 0;
fail:
	io_clear_table_tags(&ctx->file_table.data);
	io_sqe_files_unregister(ctx);
	return ret;
}

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	if (!ctx->buf_table.nr)
		return -ENXIO;
	io_rsrc_data_free(ctx, &ctx->buf_table);
	return 0;
}

/*
 * Not super efficient, but this only happens at registration time. And we do
 * cache the last compound head, so generally we'll only do a full search if
 * we don't match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->buf_table.nr; i++) {
		struct io_rsrc_node *node = ctx->buf_table.nodes[i];
		struct io_mapped_ubuf *imu;

		if (!node)
			continue;
		imu = node->buf;
		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}

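/*
 * Figure out how many pages to charge against RLIMIT_MEMLOCK for this
 * registration. A compound (huge) page is charged once at its full size,
 * and pages already accounted by an earlier registration are skipped.
 */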
static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}

static bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
			       struct io_imu_folio_data *data)
{
	struct page **page_array = *pages, **new_array = NULL;
	unsigned nr_pages_left = *nr_pages;
	unsigned nr_folios = data->nr_folios;
	unsigned i, j;

	/* Store head pages only */
	new_array = kvmalloc_array(nr_folios, sizeof(struct page *), GFP_KERNEL);
	if (!new_array)
		return false;

	for (i = 0, j = 0; i < nr_folios; i++) {
		struct page *p = compound_head(page_array[j]);
		struct folio *folio = page_folio(p);
		unsigned int nr;

		WARN_ON_ONCE(i > 0 && p != page_array[j]);

		nr = i ? data->nr_pages_mid : data->nr_pages_head;
		nr = min(nr, nr_pages_left);
		/* Drop all but one ref, the entire folio will remain pinned. */
		if (nr > 1)
			unpin_user_folio(folio, nr - 1);
		j += nr;
		nr_pages_left -= nr;
		new_array[i] = p;
	}

	WARN_ON_ONCE(j != *nr_pages);

	kvfree(page_array);
	*pages = new_array;
	*nr_pages = nr_folios;
	return true;
}

bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
			      struct io_imu_folio_data *data)
{
	struct folio *folio = page_folio(page_array[0]);
	unsigned int count = 1, nr_folios = 1;
	int i;

	data->nr_pages_mid = folio_nr_pages(folio);
	data->folio_shift = folio_shift(folio);
	data->first_folio_page_idx = folio_page_idx(folio, page_array[0]);

	/*
	 * Check if pages are contiguous inside a folio, and all folios have
	 * the same page count except for the head and tail.
	 */
	for (i = 1; i < nr_pages; i++) {
		if (page_folio(page_array[i]) == folio &&
		    page_array[i] == page_array[i-1] + 1) {
			count++;
			continue;
		}

		if (nr_folios == 1) {
			if (folio_page_idx(folio, page_array[i-1]) !=
			    data->nr_pages_mid - 1)
				return false;

			data->nr_pages_head = count;
		} else if (count != data->nr_pages_mid) {
			return false;
		}

		folio = page_folio(page_array[i]);
		if (folio_size(folio) != (1UL << data->folio_shift) ||
		    folio_page_idx(folio, page_array[i]) != 0)
			return false;

		count = 1;
		nr_folios++;
	}
	if (nr_folios == 1)
		data->nr_pages_head = count;

	data->nr_folios = nr_folios;
	return true;
}

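/*
 * Pin the user pages behind one iovec, optionally coalescing whole folios
 * into single bvec entries, account the pinned memory and build the
 * io_mapped_ubuf describing the registration. Returns NULL for a sparse
 * (NULL iov_base) entry.
 */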
static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
						   struct iovec *iov,
						   struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	struct io_rsrc_node *node;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;
	struct io_imu_folio_data data;
	bool coalesced = false;

	if (!iov->iov_base)
		return NULL;

	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
	if (!node)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
			     &nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	/* If it's huge page(s), try to coalesce them into fewer bvec entries */
	if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) {
		if (data.nr_pages_mid != 1)
			coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
	}

	imu = io_alloc_imu(ctx, nr_pages);
	if (!imu)
		goto done;

	imu->nr_bvecs = nr_pages;
	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret)
		goto done;

	size = iov->iov_len;
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->len = iov->iov_len;
	imu->folio_shift = PAGE_SHIFT;
	imu->release = io_release_ubuf;
	imu->priv = imu;
	imu->is_kbuf = false;
	imu->dir = IO_IMU_DEST | IO_IMU_SOURCE;
	if (coalesced)
		imu->folio_shift = data.folio_shift;
	refcount_set(&imu->refs, 1);

	off = (unsigned long)iov->iov_base & ~PAGE_MASK;
	if (coalesced)
		off += data.first_folio_page_idx << PAGE_SHIFT;

	node->buf = imu;
	ret = 0;

	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
		off = 0;
		size -= vec_len;
	}
done:
	if (ret) {
		if (imu)
			io_free_imu(ctx, imu);
		if (pages) {
			for (i = 0; i < nr_pages; i++)
				unpin_user_folio(page_folio(pages[i]), 1);
		}
		io_cache_free(&ctx->node_cache, node);
		node = ERR_PTR(ret);
	}
	kvfree(pages);
	return node;
}

int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data data;
	struct iovec fast_iov, *iov = &fast_iov;
	const struct iovec __user *uvec;
	int i, ret;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->buf_table.nr)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_data_alloc(&data, nr_args);
	if (ret)
		return ret;

	if (!arg)
		memset(iov, 0, sizeof(*iov));

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		if (arg) {
			uvec = (struct iovec __user *) arg;
			iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
			if (IS_ERR(iov)) {
				ret = PTR_ERR(iov);
				break;
			}
			ret = io_buffer_validate(iov);
			if (ret)
				break;
			if (ctx->compat)
				arg += sizeof(struct compat_iovec);
			else
				arg += sizeof(struct iovec);
		}

		if (tags) {
			if (copy_from_user(&tag, &tags[i], sizeof(tag))) {
				ret = -EFAULT;
				break;
			}
		}

		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			ret = PTR_ERR(node);
			break;
		}
		if (tag) {
			if (!node) {
				ret = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		data.nodes[i] = node;
	}

	ctx->buf_table = data;
	if (ret) {
		io_clear_table_tags(&ctx->buf_table);
		io_sqe_buffers_unregister(ctx);
	}
	return ret;
}

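/*
 * Register the bio_vecs backing @rq as a fixed buffer at table slot @index,
 * on behalf of a driver handling a uring_cmd. The buffer stays registered
 * until io_buffer_unregister_bvec() is called for the same slot (or the
 * table is torn down), and @release is invoked with @rq once the last
 * reference to the buffer is dropped.
 *
 * Rough usage sketch from a driver's command handler (the driver-side names
 * are illustrative only):
 *
 *	ret = io_buffer_register_bvec(cmd, rq, my_release_cb, idx, issue_flags);
 *	if (ret)
 *		return ret;
 *	... issue fixed-buffer I/O against slot idx ...
 *	io_buffer_unregister_bvec(cmd, idx, issue_flags);
 */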
int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
			    void (*release)(void *), unsigned int index,
			    unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *data = &ctx->buf_table;
	struct req_iterator rq_iter;
	struct io_mapped_ubuf *imu;
	struct io_rsrc_node *node;
	struct bio_vec bv, *bvec;
	u16 nr_bvecs;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	if (index >= data->nr) {
		ret = -EINVAL;
		goto unlock;
	}
	index = array_index_nospec(index, data->nr);

	if (data->nodes[index]) {
		ret = -EBUSY;
		goto unlock;
	}

	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
	if (!node) {
		ret = -ENOMEM;
		goto unlock;
	}

	nr_bvecs = blk_rq_nr_phys_segments(rq);
	imu = io_alloc_imu(ctx, nr_bvecs);
	if (!imu) {
		kfree(node);
		ret = -ENOMEM;
		goto unlock;
	}

	imu->ubuf = 0;
	imu->len = blk_rq_bytes(rq);
	imu->acct_pages = 0;
	imu->folio_shift = PAGE_SHIFT;
	imu->nr_bvecs = nr_bvecs;
	refcount_set(&imu->refs, 1);
	imu->release = release;
	imu->priv = rq;
	imu->is_kbuf = true;
	imu->dir = 1 << rq_data_dir(rq);

	bvec = imu->bvec;
	rq_for_each_bvec(bv, rq, rq_iter)
		*bvec++ = bv;

	node->buf = imu;
	data->nodes[index] = node;
unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_register_bvec);

int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
			      unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *data = &ctx->buf_table;
	struct io_rsrc_node *node;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	if (index >= data->nr) {
		ret = -EINVAL;
		goto unlock;
	}
	index = array_index_nospec(index, data->nr);

	node = data->nodes[index];
	if (!node) {
		ret = -EINVAL;
		goto unlock;
	}
	if (!node->buf->is_kbuf) {
		ret = -EBUSY;
		goto unlock;
	}

	io_put_rsrc_node(ctx, node);
	data->nodes[index] = NULL;
unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec);

static int validate_fixed_range(u64 buf_addr, size_t len,
				const struct io_mapped_ubuf *imu)
{
	u64 buf_end;

	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
		return -EFAULT;
	if (unlikely(len > MAX_RW_COUNT))
		return -EFAULT;
	return 0;
}

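/*
 * Import from a kernel-registered (bvec) buffer: point the iterator at the
 * bvec array, advance past @offset, and trim the segment count so the
 * iterator doesn't extend beyond the requested length.
 */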
static int io_import_kbuf(int ddir, struct iov_iter *iter,
			  struct io_mapped_ubuf *imu, size_t len, size_t offset)
{
	size_t count = len + offset;

	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count);
	iov_iter_advance(iter, offset);

	if (count < imu->len) {
		const struct bio_vec *bvec = iter->bvec;

		while (len > bvec->bv_len) {
			len -= bvec->bv_len;
			bvec++;
		}
		iter->nr_segs = 1 + bvec - iter->bvec;
	}
	return 0;
}

static int io_import_fixed(int ddir, struct iov_iter *iter,
			   struct io_mapped_ubuf *imu,
			   u64 buf_addr, size_t len)
{
	const struct bio_vec *bvec;
	size_t folio_mask;
	unsigned nr_segs;
	size_t offset;
	int ret;

	ret = validate_fixed_range(buf_addr, len, imu);
	if (unlikely(ret))
		return ret;
	if (!(imu->dir & (1 << ddir)))
		return -EFAULT;

	offset = buf_addr - imu->ubuf;

	if (imu->is_kbuf)
		return io_import_kbuf(ddir, iter, imu, len, offset);

	/*
	 * Don't use iov_iter_advance() here, as it's really slow for
	 * using the latter parts of a big fixed buffer - it iterates
	 * over each segment manually. We can cheat a bit here for user
	 * registered nodes, because we know that:
	 *
	 * 1) it's a BVEC iter, we set it up
	 * 2) all bvecs are the same size, except potentially the
	 *    first and last bvec
	 */
	folio_mask = (1UL << imu->folio_shift) - 1;
	bvec = imu->bvec;
	if (offset >= bvec->bv_len) {
		unsigned long seg_skip;

		/* skip first vec */
		offset -= bvec->bv_len;
		seg_skip = 1 + (offset >> imu->folio_shift);
		bvec += seg_skip;
		offset &= folio_mask;
	}
	nr_segs = (offset + len + bvec->bv_offset + folio_mask) >> imu->folio_shift;
	iov_iter_bvec(iter, ddir, bvec, nr_segs, len);
	iter->iov_offset = offset;
	return 0;
}

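/*
 * Look up the registered buffer node for req->buf_index and take a reference
 * to it, caching the result in the request (REQ_F_BUF_NODE) so repeated
 * imports for the same request skip the table lookup.
 */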
inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req,
					     unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_rsrc_node *node;

	if (req->flags & REQ_F_BUF_NODE)
		return req->buf_node;
	req->flags |= REQ_F_BUF_NODE;

	io_ring_submit_lock(ctx, issue_flags);
	node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
	if (node) {
		node->refs++;
		req->buf_node = node;
		io_ring_submit_unlock(ctx, issue_flags);
		return node;
	}
	req->flags &= ~REQ_F_BUF_NODE;
	io_ring_submit_unlock(ctx, issue_flags);
	return NULL;
}

int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,
		      u64 buf_addr, size_t len, int ddir,
		      unsigned issue_flags)
{
	struct io_rsrc_node *node;

	node = io_find_buf_node(req, issue_flags);
	if (!node)
		return -EFAULT;
	return io_import_fixed(ddir, iter, node->buf, buf_addr, len);
}

/* Lock two rings at once. The rings must be different! */
static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
{
	if (ctx1 > ctx2)
		swap(ctx1, ctx2);
	mutex_lock(&ctx1->uring_lock);
	mutex_lock_nested(&ctx2->uring_lock, SINGLE_DEPTH_NESTING);
}

/* Both rings are locked by the caller. */
static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
			    struct io_uring_clone_buffers *arg)
{
	struct io_rsrc_data data;
	int i, ret, off, nr;
	unsigned int nbufs;

	lockdep_assert_held(&ctx->uring_lock);
	lockdep_assert_held(&src_ctx->uring_lock);

	/*
	 * Accounting state is shared between the two rings; that only works if
	 * both rings are accounted towards the same counters.
	 */
	if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account)
		return -EINVAL;

	/* if offsets are given, must have nr specified too */
	if (!arg->nr && (arg->dst_off || arg->src_off))
		return -EINVAL;
	/* not allowed unless REPLACE is set */
	if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
		return -EBUSY;

	nbufs = src_ctx->buf_table.nr;
	if (!arg->nr)
		arg->nr = nbufs;
	else if (arg->nr > nbufs)
		return -EINVAL;
	else if (arg->nr > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
		return -EOVERFLOW;
	if (nbufs > IORING_MAX_REG_BUFFERS)
		return -EINVAL;

	ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr));
	if (ret)
		return ret;

	/* Fill entries in data from dst that won't overlap with src */
	for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
		struct io_rsrc_node *src_node = ctx->buf_table.nodes[i];

		if (src_node) {
			data.nodes[i] = src_node;
			src_node->refs++;
		}
	}

	ret = -ENXIO;
	nbufs = src_ctx->buf_table.nr;
	if (!nbufs)
		goto out_free;
	ret = -EINVAL;
	if (!arg->nr)
		arg->nr = nbufs;
	else if (arg->nr > nbufs)
		goto out_free;
	ret = -EOVERFLOW;
	if (check_add_overflow(arg->nr, arg->src_off, &off))
		goto out_free;
	if (off > nbufs)
		goto out_free;

	off = arg->dst_off;
	i = arg->src_off;
	nr = arg->nr;
	while (nr--) {
		struct io_rsrc_node *dst_node, *src_node;

		src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i);
		if (!src_node) {
			dst_node = NULL;
		} else {
			dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
			if (!dst_node) {
				ret = -ENOMEM;
				goto out_free;
			}

			refcount_inc(&src_node->buf->refs);
			dst_node->buf = src_node->buf;
		}
		data.nodes[off++] = dst_node;
		i++;
	}

	/*
	 * If asked for replace, put the old table. data->nodes[] holds both
	 * old and new nodes at this point.
	 */
	if (arg->flags & IORING_REGISTER_DST_REPLACE)
		io_rsrc_data_free(ctx, &ctx->buf_table);

	/*
	 * ctx->buf_table must be empty now - either the contents are being
	 * replaced and we just freed the table, or the contents are being
	 * copied to a ring that does not have buffers yet (checked at function
	 * entry).
	 */
	WARN_ON_ONCE(ctx->buf_table.nr);
	ctx->buf_table = data;
	return 0;

out_free:
	io_rsrc_data_free(ctx, &data);
	return ret;
}

/*
 * Copy the registered buffers from the source ring, whose file descriptor is
 * given in src_fd, to the current ring. This is identical to registering the
 * buffers with ctx, except faster as the mappings already exist.
 *
 * Since the memory is already accounted once, don't account it again.
 */
int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_clone_buffers buf;
	struct io_ring_ctx *src_ctx;
	bool registered_src;
	struct file *file;
	int ret;

	if (copy_from_user(&buf, arg, sizeof(buf)))
		return -EFAULT;
	if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE))
		return -EINVAL;
	if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr)
		return -EBUSY;
	if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
		return -EINVAL;

	registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
	file = io_uring_register_get_file(buf.src_fd, registered_src);
	if (IS_ERR(file))
		return PTR_ERR(file);

	src_ctx = file->private_data;
	if (src_ctx != ctx) {
		mutex_unlock(&ctx->uring_lock);
		lock_two_rings(ctx, src_ctx);
	}

	ret = io_clone_buffers(ctx, src_ctx, &buf);

	if (src_ctx != ctx)
		mutex_unlock(&src_ctx->uring_lock);

	fput(file);
	return ret;
}

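/*
 * Helpers for a request's iovec/bvec scratch vector (struct iou_vec).
 * io_vec_realloc() installs a freshly allocated array and frees the old one;
 * it does not preserve existing entries, so callers copy over anything they
 * still need (see io_import_reg_vec()).
 */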
void io_vec_free(struct iou_vec *iv)
{
	if (!iv->iovec)
		return;
	kfree(iv->iovec);
	iv->iovec = NULL;
	iv->nr = 0;
}

int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries)
{
	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
	struct iovec *iov;

	iov = kmalloc_array(nr_entries, sizeof(iov[0]), gfp);
	if (!iov)
		return -ENOMEM;

	io_vec_free(iv);
	iv->iovec = iov;
	iv->nr = nr_entries;
	return 0;
}

static int io_vec_fill_bvec(int ddir, struct iov_iter *iter,
			    struct io_mapped_ubuf *imu,
			    struct iovec *iovec, unsigned nr_iovs,
			    struct iou_vec *vec)
{
	unsigned long folio_size = 1 << imu->folio_shift;
	unsigned long folio_mask = folio_size - 1;
	struct bio_vec *res_bvec = vec->bvec;
	size_t total_len = 0;
	unsigned bvec_idx = 0;
	unsigned iov_idx;

	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
		size_t iov_len = iovec[iov_idx].iov_len;
		u64 buf_addr = (u64)(uintptr_t)iovec[iov_idx].iov_base;
		struct bio_vec *src_bvec;
		size_t offset;
		int ret;

		ret = validate_fixed_range(buf_addr, iov_len, imu);
		if (unlikely(ret))
			return ret;

		if (unlikely(!iov_len))
			return -EFAULT;
		if (unlikely(check_add_overflow(total_len, iov_len, &total_len)))
			return -EOVERFLOW;

		offset = buf_addr - imu->ubuf;
		/*
		 * Only the first bvec can have a non-zero bv_offset; account
		 * it here and work with full folios below.
		 */
		offset += imu->bvec[0].bv_offset;

		src_bvec = imu->bvec + (offset >> imu->folio_shift);
		offset &= folio_mask;

		for (; iov_len; offset = 0, bvec_idx++, src_bvec++) {
			size_t seg_size = min_t(size_t, iov_len,
						folio_size - offset);

			bvec_set_page(&res_bvec[bvec_idx],
				      src_bvec->bv_page, seg_size, offset);
			iov_len -= seg_size;
		}
	}
	if (total_len > MAX_RW_COUNT)
		return -EINVAL;

	iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len);
	return 0;
}

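/*
 * Upper bound on the number of bvecs needed to cover these iovecs against a
 * user-registered buffer: one bvec per fully covered folio, plus up to two
 * more for the partially covered first and last folios of each iovec.
 */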
static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs,
				 struct io_mapped_ubuf *imu)
{
	unsigned shift = imu->folio_shift;
	size_t max_segs = 0;
	unsigned i;

	for (i = 0; i < nr_iovs; i++)
		max_segs += (iov[i].iov_len >> shift) + 2;
	return max_segs;
}

static int io_vec_fill_kern_bvec(int ddir, struct iov_iter *iter,
				 struct io_mapped_ubuf *imu,
				 struct iovec *iovec, unsigned nr_iovs,
				 struct iou_vec *vec)
{
	const struct bio_vec *src_bvec = imu->bvec;
	struct bio_vec *res_bvec = vec->bvec;
	unsigned res_idx = 0;
	size_t total_len = 0;
	unsigned iov_idx;

	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
		size_t offset = (size_t)(uintptr_t)iovec[iov_idx].iov_base;
		size_t iov_len = iovec[iov_idx].iov_len;
		struct bvec_iter bi = {
			.bi_size = offset + iov_len,
		};
		struct bio_vec bv;

		bvec_iter_advance(src_bvec, &bi, offset);
		for_each_mp_bvec(bv, src_bvec, bi, bi)
			res_bvec[res_idx++] = bv;
		total_len += iov_len;
	}
	iov_iter_bvec(iter, ddir, res_bvec, res_idx, total_len);
	return 0;
}

static int iov_kern_bvec_size(const struct iovec *iov,
			      const struct io_mapped_ubuf *imu,
			      unsigned int *nr_seg)
{
	size_t offset = (size_t)(uintptr_t)iov->iov_base;
	const struct bio_vec *bvec = imu->bvec;
	int start = 0, i = 0;
	size_t off = 0;
	int ret;

	ret = validate_fixed_range(offset, iov->iov_len, imu);
	if (unlikely(ret))
		return ret;

	for (i = 0; off < offset + iov->iov_len && i < imu->nr_bvecs;
			off += bvec[i].bv_len, i++) {
		if (offset >= off && offset < off + bvec[i].bv_len)
			start = i;
	}
	*nr_seg = i - start;
	return 0;
}

static int io_kern_bvec_size(struct iovec *iov, unsigned nr_iovs,
			     struct io_mapped_ubuf *imu, unsigned *nr_segs)
{
	unsigned max_segs = 0;
	size_t total_len = 0;
	unsigned i;
	int ret;

	*nr_segs = 0;
	for (i = 0; i < nr_iovs; i++) {
		if (unlikely(!iov[i].iov_len))
			return -EFAULT;
		if (unlikely(check_add_overflow(total_len, iov[i].iov_len,
						&total_len)))
			return -EOVERFLOW;
		ret = iov_kern_bvec_size(&iov[i], imu, &max_segs);
		if (unlikely(ret))
			return ret;
		*nr_segs += max_segs;
	}
	if (total_len > MAX_RW_COUNT)
		return -EINVAL;
	return 0;
}

int io_import_reg_vec(int ddir, struct iov_iter *iter,
		      struct io_kiocb *req, struct iou_vec *vec,
		      unsigned nr_iovs, unsigned issue_flags)
{
	struct io_rsrc_node *node;
	struct io_mapped_ubuf *imu;
	unsigned iovec_off;
	struct iovec *iov;
	unsigned nr_segs;

	node = io_find_buf_node(req, issue_flags);
	if (!node)
		return -EFAULT;
	imu = node->buf;
	if (!(imu->dir & (1 << ddir)))
		return -EFAULT;

	iovec_off = vec->nr - nr_iovs;
	iov = vec->iovec + iovec_off;

	if (imu->is_kbuf) {
		int ret = io_kern_bvec_size(iov, nr_iovs, imu, &nr_segs);

		if (unlikely(ret))
			return ret;
	} else {
		nr_segs = io_estimate_bvec_size(iov, nr_iovs, imu);
	}

	if (sizeof(struct bio_vec) > sizeof(struct iovec)) {
		size_t bvec_bytes;

		bvec_bytes = nr_segs * sizeof(struct bio_vec);
		nr_segs = (bvec_bytes + sizeof(*iov) - 1) / sizeof(*iov);
		nr_segs += nr_iovs;
	}

	if (nr_segs > vec->nr) {
		struct iou_vec tmp_vec = {};
		int ret;

		ret = io_vec_realloc(&tmp_vec, nr_segs);
		if (ret)
			return ret;

		iovec_off = tmp_vec.nr - nr_iovs;
		memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs);
		io_vec_free(vec);

		*vec = tmp_vec;
		iov = vec->iovec + iovec_off;
		req->flags |= REQ_F_NEED_CLEANUP;
	}

	if (imu->is_kbuf)
		return io_vec_fill_kern_bvec(ddir, iter, imu, iov, nr_iovs, vec);

	return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec);
}

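/*
 * Copy the user iovec array into the tail of the request's iou_vec, growing
 * the vector if needed; the actual import against the registered buffer is
 * done later (see io_import_reg_vec()).
 */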
int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv,
		      const struct iovec __user *uvec, size_t uvec_segs)
{
	struct iovec *iov;
	int iovec_off, ret;
	void *res;

	if (uvec_segs > iv->nr) {
		ret = io_vec_realloc(iv, uvec_segs);
		if (ret)
			return ret;
		req->flags |= REQ_F_NEED_CLEANUP;
	}

	/* pad iovec to the right */
	iovec_off = iv->nr - uvec_segs;
	iov = iv->iovec + iovec_off;
	res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov,
			      io_is_compat(req->ctx));
	if (IS_ERR(res))
		return PTR_ERR(res);

	req->flags |= REQ_F_IMPORT_BUFFER;
	return 0;
}