// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring/cmd.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "openclose.h"
#include "rsrc.h"
#include "memmap.h"
#include "register.h"

struct io_rsrc_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
};

static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
			struct iovec *iov, struct page **last_hpage);

/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)

#define IO_CACHED_BVECS_SEGS	32

int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
	return 0;
}

static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	if (ctx->user)
		__io_unaccount_mem(ctx->user, nr_pages);

	if (ctx->mm_account)
		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}

static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	int ret;

	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
	}

	if (ctx->mm_account)
		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);

	return 0;
}
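
/*
 * Note on the accounting helpers above: __io_account_mem() charges pinned
 * pages against the registering user's RLIMIT_MEMLOCK via a lockless
 * cmpxchg loop on user->locked_vm, while mm_account->pinned_vm is adjusted
 * purely for bookkeeping and has no limit enforced here.
 */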

int io_validate_user_buf_range(u64 uaddr, u64 ulen)
{
	unsigned long tmp, base = (unsigned long)uaddr;
	unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen);

	/* arbitrary limit, but we need something */
	if (ulen > SZ_1G || !ulen)
		return -EFAULT;
	if (check_add_overflow(base, acct_len, &tmp))
		return -EOVERFLOW;
	return 0;
}

static int io_buffer_validate(struct iovec *iov)
{
	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
	if (!iov->iov_base)
		return iov->iov_len ? -EFAULT : 0;

	return io_validate_user_buf_range((unsigned long)iov->iov_base,
					  iov->iov_len);
}

static void io_release_ubuf(void *priv)
{
	struct io_mapped_ubuf *imu = priv;
	unsigned int i;

	for (i = 0; i < imu->nr_bvecs; i++) {
		struct folio *folio = page_folio(imu->bvec[i].bv_page);

		unpin_user_folio(folio, 1);
	}
}

static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
					   int nr_bvecs)
{
	if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
		return io_cache_alloc(&ctx->imu_cache, GFP_KERNEL);
	return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs),
			GFP_KERNEL);
}

static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
	if (imu->nr_bvecs <= IO_CACHED_BVECS_SEGS)
		io_cache_free(&ctx->imu_cache, imu);
	else
		kvfree(imu);
}

static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
	if (!refcount_dec_and_test(&imu->refs))
		return;

	if (imu->acct_pages)
		io_unaccount_mem(ctx, imu->acct_pages);
	imu->release(imu->priv);
	io_free_imu(ctx, imu);
}

struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
{
	struct io_rsrc_node *node;

	node = io_cache_alloc(&ctx->node_cache, GFP_KERNEL);
	if (node) {
		node->type = type;
		node->refs = 1;
		node->tag = 0;
		node->file_ptr = 0;
	}
	return node;
}

bool io_rsrc_cache_init(struct io_ring_ctx *ctx)
{
	const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec,
						 IO_CACHED_BVECS_SEGS);
	const int node_size = sizeof(struct io_rsrc_node);
	bool ret;

	ret = io_alloc_cache_init(&ctx->node_cache, IO_ALLOC_CACHE_MAX,
				  node_size, 0);
	ret |= io_alloc_cache_init(&ctx->imu_cache, IO_ALLOC_CACHE_MAX,
				   imu_cache_size, 0);
	return ret;
}

void io_rsrc_cache_free(struct io_ring_ctx *ctx)
{
	io_alloc_cache_free(&ctx->node_cache, kfree);
	io_alloc_cache_free(&ctx->imu_cache, kfree);
}

static void io_clear_table_tags(struct io_rsrc_data *data)
{
	int i;

	for (i = 0; i < data->nr; i++) {
		struct io_rsrc_node *node = data->nodes[i];

		if (node)
			node->tag = 0;
	}
}

__cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
			      struct io_rsrc_data *data)
{
	if (!data->nr)
		return;
	while (data->nr--) {
		if (data->nodes[data->nr])
			io_put_rsrc_node(ctx, data->nodes[data->nr]);
	}
	kvfree(data->nodes);
	data->nodes = NULL;
	data->nr = 0;
}

__cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
{
	data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *),
				     GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (data->nodes) {
		data->nr = nr;
		return 0;
	}
	return -ENOMEM;
}

static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	int fd, i, err = 0;
	unsigned int done;

	if (!ctx->file_table.data.nr)
		return -ENXIO;
	if (up->offset + nr_args > ctx->file_table.data.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = up->offset + done;
		if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i))
			io_file_bitmap_clear(&ctx->file_table, i);

		if (fd != -1) {
			struct file *file = fget(fd);
			struct io_rsrc_node *node;

			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
			if (!node) {
				err = -ENOMEM;
				fput(file);
				break;
			}
			ctx->file_table.data.nodes[i] = node;
			if (tag)
				node->tag = tag;
			io_fixed_file_set(node, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}
	return done ? done : err;
}

static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec fast_iov, *iov;
	struct page *last_hpage = NULL;
	struct iovec __user *uvec;
	u64 user_data = up->data;
	__u32 done;
	int i, err;

	if (!ctx->buf_table.nr)
		return -ENXIO;
	if (up->offset + nr_args > ctx->buf_table.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		uvec = u64_to_user_ptr(user_data);
		iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
		if (IS_ERR(iov)) {
			err = PTR_ERR(iov);
			break;
		}
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(iov);
		if (err)
			break;
		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			err = PTR_ERR(node);
			break;
		}
		if (tag) {
			if (!node) {
				err = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
		io_reset_rsrc_node(ctx, &ctx->buf_table, i);
		ctx->buf_table.nodes[i] = node;
		if (ctx->compat)
			user_data += sizeof(struct compat_iovec);
		else
			user_data += sizeof(struct iovec);
	}
	return done ? done : err;
}

static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args)
{
	__u32 tmp;

	lockdep_assert_held(&ctx->uring_lock);

	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}

int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	return 0;
}

static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_table.data.nr)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}

int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;
	int ret;

	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
						&up2, up->nr_args);
		io_ring_submit_unlock(ctx, issue_flags);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_COMPLETE;
}

void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
	if (node->tag)
		io_post_aux_cqe(ctx, node->tag, 0, 0);

	switch (node->type) {
	case IORING_RSRC_FILE:
		fput(io_slot_file(node));
		break;
	case IORING_RSRC_BUFFER:
		io_buffer_unmap(ctx, node->buf);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}

	io_cache_free(&ctx->node_cache, node);
}

int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	if (!ctx->file_table.data.nr)
		return -ENXIO;

	io_free_file_tables(ctx, &ctx->file_table);
	io_file_table_set_alloc_range(ctx, 0, 0);
	return 0;
}

int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_table.data.nr)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args))
		return -ENOMEM;

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		ret = -EFAULT;
		if (tags && copy_from_user(&tag, &tags[i], sizeof(tag)))
			goto fail;
		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd)))
			goto fail;
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (tag)
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		ret = -ENOMEM;
		node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
		if (!node) {
			fput(file);
			goto fail;
		}
		if (tag)
			node->tag = tag;
		ctx->file_table.data.nodes[i] = node;
		io_fixed_file_set(node, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
	return 0;
fail:
	io_clear_table_tags(&ctx->file_table.data);
	io_sqe_files_unregister(ctx);
	return ret;
}

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	if (!ctx->buf_table.nr)
		return -ENXIO;
	io_rsrc_data_free(ctx, &ctx->buf_table);
	return 0;
}

/*
 * Not super efficient, but this only runs at registration time. And we do
 * cache the last compound head, so generally we'll only do a full search
 * if we don't match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->buf_table.nr; i++) {
		struct io_rsrc_node *node = ctx->buf_table.nodes[i];
		struct io_mapped_ubuf *imu;

		if (!node)
			continue;
		imu = node->buf;
		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}

static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}
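
/*
 * Buffers backed by huge pages (or other large folios) would otherwise be
 * described by one bvec per PAGE_SIZE page. The helpers below detect that
 * case and collapse the pinned page array down to one entry per folio, so
 * e.g. a 2MB huge page with 4K base pages costs a single bvec rather than
 * 512 of them.
 */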

static bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
				struct io_imu_folio_data *data)
{
	struct page **page_array = *pages, **new_array = NULL;
	unsigned nr_pages_left = *nr_pages;
	unsigned nr_folios = data->nr_folios;
	unsigned i, j;

	/* Store head pages only */
	new_array = kvmalloc_array(nr_folios, sizeof(struct page *), GFP_KERNEL);
	if (!new_array)
		return false;

	for (i = 0, j = 0; i < nr_folios; i++) {
		struct page *p = compound_head(page_array[j]);
		struct folio *folio = page_folio(p);
		unsigned int nr;

		WARN_ON_ONCE(i > 0 && p != page_array[j]);

		nr = i ? data->nr_pages_mid : data->nr_pages_head;
		nr = min(nr, nr_pages_left);
		/* Drop all but one ref, the entire folio will remain pinned. */
		if (nr > 1)
			unpin_user_folio(folio, nr - 1);
		j += nr;
		nr_pages_left -= nr;
		new_array[i] = p;
	}

	WARN_ON_ONCE(j != *nr_pages);

	kvfree(page_array);
	*pages = new_array;
	*nr_pages = nr_folios;
	return true;
}

bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
			      struct io_imu_folio_data *data)
{
	struct folio *folio = page_folio(page_array[0]);
	unsigned int count = 1, nr_folios = 1;
	int i;

	data->nr_pages_mid = folio_nr_pages(folio);
	data->folio_shift = folio_shift(folio);
	data->first_folio_page_idx = folio_page_idx(folio, page_array[0]);

	/*
	 * Check if pages are contiguous inside a folio, and all folios have
	 * the same page count except for the head and tail.
	 */
	for (i = 1; i < nr_pages; i++) {
		if (page_folio(page_array[i]) == folio &&
		    page_array[i] == page_array[i-1] + 1) {
			count++;
			continue;
		}

		if (nr_folios == 1) {
			if (folio_page_idx(folio, page_array[i-1]) !=
			    data->nr_pages_mid - 1)
				return false;

			data->nr_pages_head = count;
		} else if (count != data->nr_pages_mid) {
			return false;
		}

		folio = page_folio(page_array[i]);
		if (folio_size(folio) != (1UL << data->folio_shift) ||
		    folio_page_idx(folio, page_array[i]) != 0)
			return false;

		count = 1;
		nr_folios++;
	}
	if (nr_folios == 1)
		data->nr_pages_head = count;

	data->nr_folios = nr_folios;
	return true;
}

static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
						   struct iovec *iov,
						   struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	struct io_rsrc_node *node;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;
	struct io_imu_folio_data data;
	bool coalesced = false;

	if (!iov->iov_base)
		return NULL;

	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
	if (!node)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
				&nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	/* If it's huge page(s), try to coalesce them into fewer bvec entries */
	if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) {
		if (data.nr_pages_mid != 1)
			coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
	}

	imu = io_alloc_imu(ctx, nr_pages);
	if (!imu)
		goto done;

	imu->nr_bvecs = nr_pages;
	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret)
		goto done;

	size = iov->iov_len;
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->len = iov->iov_len;
	imu->folio_shift = PAGE_SHIFT;
	imu->release = io_release_ubuf;
	imu->priv = imu;
	imu->is_kbuf = false;
	imu->dir = IO_IMU_DEST | IO_IMU_SOURCE;
	if (coalesced)
		imu->folio_shift = data.folio_shift;
	refcount_set(&imu->refs, 1);

	off = (unsigned long)iov->iov_base & ~PAGE_MASK;
	if (coalesced)
		off += data.first_folio_page_idx << PAGE_SHIFT;

	node->buf = imu;
	ret = 0;

	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
		off = 0;
		size -= vec_len;
	}
done:
	if (ret) {
		if (imu)
			io_free_imu(ctx, imu);
		if (pages) {
			for (i = 0; i < nr_pages; i++)
				unpin_user_folio(page_folio(pages[i]), 1);
		}
		io_cache_free(&ctx->node_cache, node);
		node = ERR_PTR(ret);
	}
	kvfree(pages);
	return node;
}

int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data data;
	struct iovec fast_iov, *iov = &fast_iov;
	const struct iovec __user *uvec;
	int i, ret;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->buf_table.nr)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_data_alloc(&data, nr_args);
	if (ret)
		return ret;

	if (!arg)
		memset(iov, 0, sizeof(*iov));

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		if (arg) {
			uvec = (struct iovec __user *) arg;
			iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
			if (IS_ERR(iov)) {
				ret = PTR_ERR(iov);
				break;
			}
			ret = io_buffer_validate(iov);
			if (ret)
				break;
			if (ctx->compat)
				arg += sizeof(struct compat_iovec);
			else
				arg += sizeof(struct iovec);
		}

		if (tags) {
			if (copy_from_user(&tag, &tags[i], sizeof(tag))) {
				ret = -EFAULT;
				break;
			}
		}

		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			ret = PTR_ERR(node);
			break;
		}
		if (tag) {
			if (!node) {
				ret = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		data.nodes[i] = node;
	}

	ctx->buf_table = data;
	if (ret) {
		io_clear_table_tags(&ctx->buf_table);
		io_sqe_buffers_unregister(ctx);
	}
	return ret;
}

int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
			    void (*release)(void *), unsigned int index,
			    unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *data = &ctx->buf_table;
	struct req_iterator rq_iter;
	struct io_mapped_ubuf *imu;
	struct io_rsrc_node *node;
	struct bio_vec bv, *bvec;
	u16 nr_bvecs;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	if (index >= data->nr) {
		ret = -EINVAL;
		goto unlock;
	}
	index = array_index_nospec(index, data->nr);

	if (data->nodes[index]) {
		ret = -EBUSY;
		goto unlock;
	}

	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
	if (!node) {
		ret = -ENOMEM;
		goto unlock;
	}

	nr_bvecs = blk_rq_nr_phys_segments(rq);
	imu = io_alloc_imu(ctx, nr_bvecs);
	if (!imu) {
		kfree(node);
		ret = -ENOMEM;
		goto unlock;
	}

	imu->ubuf = 0;
	imu->len = blk_rq_bytes(rq);
	imu->acct_pages = 0;
	imu->folio_shift = PAGE_SHIFT;
	imu->nr_bvecs = nr_bvecs;
	refcount_set(&imu->refs, 1);
	imu->release = release;
	imu->priv = rq;
	imu->is_kbuf = true;
	imu->dir = 1 << rq_data_dir(rq);

	bvec = imu->bvec;
	rq_for_each_bvec(bv, rq, rq_iter)
		*bvec++ = bv;

	node->buf = imu;
	data->nodes[index] = node;
unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_register_bvec);

int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
			      unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *data = &ctx->buf_table;
	struct io_rsrc_node *node;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	if (index >= data->nr) {
		ret = -EINVAL;
		goto unlock;
	}
	index = array_index_nospec(index, data->nr);

	node = data->nodes[index];
	if (!node) {
		ret = -EINVAL;
		goto unlock;
	}
	if (!node->buf->is_kbuf) {
		ret = -EBUSY;
		goto unlock;
	}

	io_put_rsrc_node(ctx, node);
	data->nodes[index] = NULL;
unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec);
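
/*
 * io_buffer_register_bvec() and io_buffer_unregister_bvec() are the
 * driver-facing counterparts of user buffer registration: instead of pinning
 * user pages, they publish the bio_vecs of an in-flight request in a fixed
 * buffer slot (is_kbuf == true), so io_uring requests can target that data
 * by index. Nothing extra is pinned or accounted here; the caller-provided
 * release() callback runs once the last reference to the buffer is dropped.
 */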

static int validate_fixed_range(u64 buf_addr, size_t len,
				const struct io_mapped_ubuf *imu)
{
	u64 buf_end;

	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
		return -EFAULT;
	if (unlikely(len > MAX_RW_COUNT))
		return -EFAULT;
	return 0;
}

static int io_import_kbuf(int ddir, struct iov_iter *iter,
			  struct io_mapped_ubuf *imu, size_t len, size_t offset)
{
	size_t count = len + offset;

	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count);
	iov_iter_advance(iter, offset);

	if (count < imu->len) {
		const struct bio_vec *bvec = iter->bvec;

		while (len > bvec->bv_len) {
			len -= bvec->bv_len;
			bvec++;
		}
		iter->nr_segs = 1 + bvec - iter->bvec;
	}
	return 0;
}

static int io_import_fixed(int ddir, struct iov_iter *iter,
			   struct io_mapped_ubuf *imu,
			   u64 buf_addr, size_t len)
{
	const struct bio_vec *bvec;
	size_t folio_mask;
	unsigned nr_segs;
	size_t offset;
	int ret;

	ret = validate_fixed_range(buf_addr, len, imu);
	if (unlikely(ret))
		return ret;
	if (!(imu->dir & (1 << ddir)))
		return -EFAULT;

	offset = buf_addr - imu->ubuf;

	if (imu->is_kbuf)
		return io_import_kbuf(ddir, iter, imu, len, offset);

	/*
	 * Don't use iov_iter_advance() here, as it's really slow for
	 * using the latter parts of a big fixed buffer - it iterates
	 * over each segment manually. We can cheat a bit here for user
	 * registered nodes, because we know that:
	 *
	 * 1) it's a BVEC iter, we set it up
	 * 2) all bvecs are the same in size, except potentially the
	 *    first and last bvec
	 */
	folio_mask = (1UL << imu->folio_shift) - 1;
	bvec = imu->bvec;
	if (offset >= bvec->bv_len) {
		unsigned long seg_skip;

		/* skip first vec */
		offset -= bvec->bv_len;
		seg_skip = 1 + (offset >> imu->folio_shift);
		bvec += seg_skip;
		offset &= folio_mask;
	}
	nr_segs = (offset + len + bvec->bv_offset + folio_mask) >> imu->folio_shift;
	iov_iter_bvec(iter, ddir, bvec, nr_segs, len);
	iter->iov_offset = offset;
	return 0;
}
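
/*
 * Worked example for the fast path above (a sketch, with made-up numbers):
 * with 4K folios (folio_shift == 12), a registered buffer starting at
 * ubuf = 0x1000_0200 has bvec[0].bv_offset = 0x200 and bv_len = 0xe00.
 * Importing at buf_addr = ubuf + 0x2400 gives offset = 0x2400; since that
 * exceeds bv_len, the first segment is skipped plus (0x2400 - 0xe00) >> 12
 * = 1 more, landing on bvec[2] with an in-folio offset of 0x600.
 */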

inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req,
					     unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_rsrc_node *node;

	if (req->flags & REQ_F_BUF_NODE)
		return req->buf_node;
	req->flags |= REQ_F_BUF_NODE;

	io_ring_submit_lock(ctx, issue_flags);
	node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
	if (node) {
		node->refs++;
		req->buf_node = node;
		io_ring_submit_unlock(ctx, issue_flags);
		return node;
	}
	req->flags &= ~REQ_F_BUF_NODE;
	io_ring_submit_unlock(ctx, issue_flags);
	return NULL;
}

int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,
			u64 buf_addr, size_t len, int ddir,
			unsigned issue_flags)
{
	struct io_rsrc_node *node;

	node = io_find_buf_node(req, issue_flags);
	if (!node)
		return -EFAULT;
	return io_import_fixed(ddir, iter, node->buf, buf_addr, len);
}

/* Lock two rings at once. The rings must be different! */
static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
{
	if (ctx1 > ctx2)
		swap(ctx1, ctx2);
	mutex_lock(&ctx1->uring_lock);
	mutex_lock_nested(&ctx2->uring_lock, SINGLE_DEPTH_NESTING);
}

/* Both rings are locked by the caller. */
static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
			    struct io_uring_clone_buffers *arg)
{
	struct io_rsrc_data data;
	int i, ret, off, nr;
	unsigned int nbufs;

	lockdep_assert_held(&ctx->uring_lock);
	lockdep_assert_held(&src_ctx->uring_lock);

	/*
	 * Accounting state is shared between the two rings; that only works if
	 * both rings are accounted towards the same counters.
	 */
	if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account)
		return -EINVAL;

	/* if offsets are given, must have nr specified too */
	if (!arg->nr && (arg->dst_off || arg->src_off))
		return -EINVAL;
	/* not allowed unless REPLACE is set */
	if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
		return -EBUSY;

	nbufs = src_ctx->buf_table.nr;
	if (!arg->nr)
		arg->nr = nbufs;
	else if (arg->nr > nbufs)
		return -EINVAL;
	else if (arg->nr > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
		return -EOVERFLOW;
	if (nbufs > IORING_MAX_REG_BUFFERS)
		return -EINVAL;

	ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr));
	if (ret)
		return ret;

	/* Fill entries in data from dst that won't overlap with src */
	for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
		struct io_rsrc_node *src_node = ctx->buf_table.nodes[i];

		if (src_node) {
			data.nodes[i] = src_node;
			src_node->refs++;
		}
	}

	ret = -ENXIO;
	nbufs = src_ctx->buf_table.nr;
	if (!nbufs)
		goto out_free;
	ret = -EINVAL;
	if (!arg->nr)
		arg->nr = nbufs;
	else if (arg->nr > nbufs)
		goto out_free;
	ret = -EOVERFLOW;
	if (check_add_overflow(arg->nr, arg->src_off, &off))
		goto out_free;
	if (off > nbufs)
		goto out_free;

	off = arg->dst_off;
	i = arg->src_off;
	nr = arg->nr;
	while (nr--) {
		struct io_rsrc_node *dst_node, *src_node;

		src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i);
		if (!src_node) {
			dst_node = NULL;
		} else {
			dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
			if (!dst_node) {
				ret = -ENOMEM;
				goto out_free;
			}

			refcount_inc(&src_node->buf->refs);
			dst_node->buf = src_node->buf;
		}
		data.nodes[off++] = dst_node;
		i++;
	}

	/*
	 * If asked for replace, put the old table. data->nodes[] holds both
	 * old and new nodes at this point.
	 */
	if (arg->flags & IORING_REGISTER_DST_REPLACE)
		io_rsrc_data_free(ctx, &ctx->buf_table);

	/*
	 * ctx->buf_table must be empty now - either the contents are being
	 * replaced and we just freed the table, or the contents are being
	 * copied to a ring that does not have buffers yet (checked at function
	 * entry).
	 */
1257 */ 1258 WARN_ON_ONCE(ctx->buf_table.nr); 1259 ctx->buf_table = data; 1260 return 0; 1261 1262 out_free: 1263 io_rsrc_data_free(ctx, &data); 1264 return ret; 1265 } 1266 1267 /* 1268 * Copy the registered buffers from the source ring whose file descriptor 1269 * is given in the src_fd to the current ring. This is identical to registering 1270 * the buffers with ctx, except faster as mappings already exist. 1271 * 1272 * Since the memory is already accounted once, don't account it again. 1273 */ 1274 int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg) 1275 { 1276 struct io_uring_clone_buffers buf; 1277 struct io_ring_ctx *src_ctx; 1278 bool registered_src; 1279 struct file *file; 1280 int ret; 1281 1282 if (copy_from_user(&buf, arg, sizeof(buf))) 1283 return -EFAULT; 1284 if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE)) 1285 return -EINVAL; 1286 if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr) 1287 return -EBUSY; 1288 if (memchr_inv(buf.pad, 0, sizeof(buf.pad))) 1289 return -EINVAL; 1290 1291 registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0; 1292 file = io_uring_register_get_file(buf.src_fd, registered_src); 1293 if (IS_ERR(file)) 1294 return PTR_ERR(file); 1295 1296 src_ctx = file->private_data; 1297 if (src_ctx != ctx) { 1298 mutex_unlock(&ctx->uring_lock); 1299 lock_two_rings(ctx, src_ctx); 1300 } 1301 1302 ret = io_clone_buffers(ctx, src_ctx, &buf); 1303 1304 if (src_ctx != ctx) 1305 mutex_unlock(&src_ctx->uring_lock); 1306 1307 fput(file); 1308 return ret; 1309 } 1310 1311 void io_vec_free(struct iou_vec *iv) 1312 { 1313 if (!iv->iovec) 1314 return; 1315 kfree(iv->iovec); 1316 iv->iovec = NULL; 1317 iv->nr = 0; 1318 } 1319 1320 int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries) 1321 { 1322 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; 1323 struct iovec *iov; 1324 1325 iov = kmalloc_array(nr_entries, sizeof(iov[0]), gfp); 1326 if (!iov) 1327 return -ENOMEM; 1328 1329 io_vec_free(iv); 1330 iv->iovec = iov; 1331 iv->nr = nr_entries; 1332 return 0; 1333 } 1334 1335 static int io_vec_fill_bvec(int ddir, struct iov_iter *iter, 1336 struct io_mapped_ubuf *imu, 1337 struct iovec *iovec, unsigned nr_iovs, 1338 struct iou_vec *vec) 1339 { 1340 unsigned long folio_size = 1 << imu->folio_shift; 1341 unsigned long folio_mask = folio_size - 1; 1342 struct bio_vec *res_bvec = vec->bvec; 1343 size_t total_len = 0; 1344 unsigned bvec_idx = 0; 1345 unsigned iov_idx; 1346 1347 for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) { 1348 size_t iov_len = iovec[iov_idx].iov_len; 1349 u64 buf_addr = (u64)(uintptr_t)iovec[iov_idx].iov_base; 1350 struct bio_vec *src_bvec; 1351 size_t offset; 1352 int ret; 1353 1354 ret = validate_fixed_range(buf_addr, iov_len, imu); 1355 if (unlikely(ret)) 1356 return ret; 1357 1358 if (unlikely(!iov_len)) 1359 return -EFAULT; 1360 if (unlikely(check_add_overflow(total_len, iov_len, &total_len))) 1361 return -EOVERFLOW; 1362 1363 offset = buf_addr - imu->ubuf; 1364 /* 1365 * Only the first bvec can have non zero bv_offset, account it 1366 * here and work with full folios below. 

void io_vec_free(struct iou_vec *iv)
{
	if (!iv->iovec)
		return;
	kfree(iv->iovec);
	iv->iovec = NULL;
	iv->nr = 0;
}

int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries)
{
	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
	struct iovec *iov;

	iov = kmalloc_array(nr_entries, sizeof(iov[0]), gfp);
	if (!iov)
		return -ENOMEM;

	io_vec_free(iv);
	iv->iovec = iov;
	iv->nr = nr_entries;
	return 0;
}

static int io_vec_fill_bvec(int ddir, struct iov_iter *iter,
				struct io_mapped_ubuf *imu,
				struct iovec *iovec, unsigned nr_iovs,
				struct iou_vec *vec)
{
	unsigned long folio_size = 1 << imu->folio_shift;
	unsigned long folio_mask = folio_size - 1;
	struct bio_vec *res_bvec = vec->bvec;
	size_t total_len = 0;
	unsigned bvec_idx = 0;
	unsigned iov_idx;

	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
		size_t iov_len = iovec[iov_idx].iov_len;
		u64 buf_addr = (u64)(uintptr_t)iovec[iov_idx].iov_base;
		struct bio_vec *src_bvec;
		size_t offset;
		int ret;

		ret = validate_fixed_range(buf_addr, iov_len, imu);
		if (unlikely(ret))
			return ret;

		if (unlikely(!iov_len))
			return -EFAULT;
		if (unlikely(check_add_overflow(total_len, iov_len, &total_len)))
			return -EOVERFLOW;

		offset = buf_addr - imu->ubuf;
		/*
		 * Only the first bvec can have non zero bv_offset, account it
		 * here and work with full folios below.
		 */
		offset += imu->bvec[0].bv_offset;

		src_bvec = imu->bvec + (offset >> imu->folio_shift);
		offset &= folio_mask;

		for (; iov_len; offset = 0, bvec_idx++, src_bvec++) {
			size_t seg_size = min_t(size_t, iov_len,
						folio_size - offset);

			bvec_set_page(&res_bvec[bvec_idx],
				      src_bvec->bv_page, seg_size, offset);
			iov_len -= seg_size;
		}
	}
	if (total_len > MAX_RW_COUNT)
		return -EINVAL;

	iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len);
	return 0;
}

static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs,
				 struct io_mapped_ubuf *imu)
{
	unsigned shift = imu->folio_shift;
	size_t max_segs = 0;
	unsigned i;

	for (i = 0; i < nr_iovs; i++)
		max_segs += (iov[i].iov_len >> shift) + 2;
	return max_segs;
}

static int io_vec_fill_kern_bvec(int ddir, struct iov_iter *iter,
				 struct io_mapped_ubuf *imu,
				 struct iovec *iovec, unsigned nr_iovs,
				 struct iou_vec *vec)
{
	const struct bio_vec *src_bvec = imu->bvec;
	struct bio_vec *res_bvec = vec->bvec;
	unsigned res_idx = 0;
	size_t total_len = 0;
	unsigned iov_idx;

	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
		size_t offset = (size_t)(uintptr_t)iovec[iov_idx].iov_base;
		size_t iov_len = iovec[iov_idx].iov_len;
		struct bvec_iter bi = {
			.bi_size = offset + iov_len,
		};
		struct bio_vec bv;

		bvec_iter_advance(src_bvec, &bi, offset);
		for_each_mp_bvec(bv, src_bvec, bi, bi)
			res_bvec[res_idx++] = bv;
		total_len += iov_len;
	}
	iov_iter_bvec(iter, ddir, res_bvec, res_idx, total_len);
	return 0;
}

static int iov_kern_bvec_size(const struct iovec *iov,
			      const struct io_mapped_ubuf *imu,
			      unsigned int *nr_seg)
{
	size_t offset = (size_t)(uintptr_t)iov->iov_base;
	const struct bio_vec *bvec = imu->bvec;
	int start = 0, i = 0;
	size_t off = 0;
	int ret;

	ret = validate_fixed_range(offset, iov->iov_len, imu);
	if (unlikely(ret))
		return ret;

	for (i = 0; off < offset + iov->iov_len && i < imu->nr_bvecs;
			off += bvec[i].bv_len, i++) {
		if (offset >= off && offset < off + bvec[i].bv_len)
			start = i;
	}
	*nr_seg = i - start;
	return 0;
}

static int io_kern_bvec_size(struct iovec *iov, unsigned nr_iovs,
			     struct io_mapped_ubuf *imu, unsigned *nr_segs)
{
	unsigned max_segs = 0;
	size_t total_len = 0;
	unsigned i;
	int ret;

	*nr_segs = 0;
	for (i = 0; i < nr_iovs; i++) {
		if (unlikely(!iov[i].iov_len))
			return -EFAULT;
		if (unlikely(check_add_overflow(total_len, iov[i].iov_len,
						&total_len)))
			return -EOVERFLOW;
		ret = iov_kern_bvec_size(&iov[i], imu, &max_segs);
		if (unlikely(ret))
			return ret;
		*nr_segs += max_segs;
	}
	if (total_len > MAX_RW_COUNT)
		return -EINVAL;
	return 0;
}
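
/*
 * io_import_reg_vec() below reuses the tail of the caller's iou_vec as the
 * bio_vec output array: the nr_iovs source iovecs are kept right-aligned at
 * the end of the allocation and the resulting bvecs are written from the
 * front. When struct bio_vec is larger than struct iovec, the estimated
 * segment count is scaled up so the single allocation still has room for
 * both views.
 */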

int io_import_reg_vec(int ddir, struct iov_iter *iter,
			struct io_kiocb *req, struct iou_vec *vec,
			unsigned nr_iovs, unsigned issue_flags)
{
	struct io_rsrc_node *node;
	struct io_mapped_ubuf *imu;
	unsigned iovec_off;
	struct iovec *iov;
	unsigned nr_segs;

	node = io_find_buf_node(req, issue_flags);
	if (!node)
		return -EFAULT;
	imu = node->buf;
	if (!(imu->dir & (1 << ddir)))
		return -EFAULT;

	iovec_off = vec->nr - nr_iovs;
	iov = vec->iovec + iovec_off;

	if (imu->is_kbuf) {
		int ret = io_kern_bvec_size(iov, nr_iovs, imu, &nr_segs);

		if (unlikely(ret))
			return ret;
	} else {
		nr_segs = io_estimate_bvec_size(iov, nr_iovs, imu);
	}

	if (sizeof(struct bio_vec) > sizeof(struct iovec)) {
		size_t bvec_bytes;

		bvec_bytes = nr_segs * sizeof(struct bio_vec);
		nr_segs = (bvec_bytes + sizeof(*iov) - 1) / sizeof(*iov);
		nr_segs += nr_iovs;
	}

	if (nr_segs > vec->nr) {
		struct iou_vec tmp_vec = {};
		int ret;

		ret = io_vec_realloc(&tmp_vec, nr_segs);
		if (ret)
			return ret;

		iovec_off = tmp_vec.nr - nr_iovs;
		memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs);
		io_vec_free(vec);

		*vec = tmp_vec;
		iov = vec->iovec + iovec_off;
		req->flags |= REQ_F_NEED_CLEANUP;
	}

	if (imu->is_kbuf)
		return io_vec_fill_kern_bvec(ddir, iter, imu, iov, nr_iovs, vec);

	return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec);
}

int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv,
		      const struct iovec __user *uvec, size_t uvec_segs)
{
	struct iovec *iov;
	int iovec_off, ret;
	void *res;

	if (uvec_segs > iv->nr) {
		ret = io_vec_realloc(iv, uvec_segs);
		if (ret)
			return ret;
		req->flags |= REQ_F_NEED_CLEANUP;
	}

	/* pad iovec to the right */
	iovec_off = iv->nr - uvec_segs;
	iov = iv->iovec + iovec_off;
	res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov,
			      io_is_compat(req->ctx));
	if (IS_ERR(res))
		return PTR_ERR(res);

	req->flags |= REQ_F_IMPORT_BUFFER;
	return 0;
}