// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring/cmd.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "openclose.h"
#include "rsrc.h"
#include "memmap.h"
#include "register.h"

struct io_rsrc_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
};

static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
			struct iovec *iov, struct page **last_hpage);

/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)

#define IO_CACHED_BVECS_SEGS	32

int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
	return 0;
}

static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	if (ctx->user)
		__io_unaccount_mem(ctx->user, nr_pages);

	if (ctx->mm_account)
		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}

static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	int ret;

	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
	}

	if (ctx->mm_account)
		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);

	return 0;
}
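
/*
 * Validate a single iovec prior to buffer registration. A NULL base with a
 * zero length denotes a sparse slot; anything else must be non-empty, no
 * larger than 1G, and must not wrap the address space once rounded up to
 * page granularity.
 */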
int io_buffer_validate(struct iovec *iov)
{
	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);

	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
	if (!iov->iov_base)
		return iov->iov_len ? -EFAULT : 0;
	if (!iov->iov_len)
		return -EFAULT;

	/* arbitrary limit, but we need something */
	if (iov->iov_len > SZ_1G)
		return -EFAULT;

	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
		return -EOVERFLOW;

	return 0;
}

static void io_release_ubuf(void *priv)
{
	struct io_mapped_ubuf *imu = priv;
	unsigned int i;

	for (i = 0; i < imu->nr_bvecs; i++)
		unpin_user_page(imu->bvec[i].bv_page);
}

static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
					   int nr_bvecs)
{
	if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
		return io_cache_alloc(&ctx->imu_cache, GFP_KERNEL);
	return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs),
			GFP_KERNEL);
}

static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
	if (imu->nr_bvecs <= IO_CACHED_BVECS_SEGS)
		io_cache_free(&ctx->imu_cache, imu);
	else
		kvfree(imu);
}

static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
	if (!refcount_dec_and_test(&imu->refs))
		return;

	if (imu->acct_pages)
		io_unaccount_mem(ctx, imu->acct_pages);
	imu->release(imu->priv);
	io_free_imu(ctx, imu);
}

struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
{
	struct io_rsrc_node *node;

	node = io_cache_alloc(&ctx->node_cache, GFP_KERNEL);
	if (node) {
		node->type = type;
		node->refs = 1;
		node->tag = 0;
		node->file_ptr = 0;
	}
	return node;
}

bool io_rsrc_cache_init(struct io_ring_ctx *ctx)
{
	const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec,
						 IO_CACHED_BVECS_SEGS);
	const int node_size = sizeof(struct io_rsrc_node);
	bool ret;

	ret = io_alloc_cache_init(&ctx->node_cache, IO_ALLOC_CACHE_MAX,
				  node_size, 0);
	ret |= io_alloc_cache_init(&ctx->imu_cache, IO_ALLOC_CACHE_MAX,
				   imu_cache_size, 0);
	return ret;
}

void io_rsrc_cache_free(struct io_ring_ctx *ctx)
{
	io_alloc_cache_free(&ctx->node_cache, kfree);
	io_alloc_cache_free(&ctx->imu_cache, kfree);
}

__cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
			      struct io_rsrc_data *data)
{
	if (!data->nr)
		return;
	while (data->nr--) {
		if (data->nodes[data->nr])
			io_put_rsrc_node(ctx, data->nodes[data->nr]);
	}
	kvfree(data->nodes);
	data->nodes = NULL;
	data->nr = 0;
}

__cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
{
	data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *),
				     GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (data->nodes) {
		data->nr = nr;
		return 0;
	}
	return -ENOMEM;
}
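
/*
 * Apply an update to an already registered file table: each entry either
 * clears a slot (fd == -1), leaves it untouched (IORING_REGISTER_FILES_SKIP),
 * or installs a newly fetched file, optionally tagged. Returns the number of
 * entries processed, or an error if nothing was updated.
 */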
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	int fd, i, err = 0;
	unsigned int done;

	if (!ctx->file_table.data.nr)
		return -ENXIO;
	if (up->offset + nr_args > ctx->file_table.data.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = up->offset + done;
		if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i))
			io_file_bitmap_clear(&ctx->file_table, i);

		if (fd != -1) {
			struct file *file = fget(fd);
			struct io_rsrc_node *node;

			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
			if (!node) {
				err = -ENOMEM;
				fput(file);
				break;
			}
			ctx->file_table.data.nodes[i] = node;
			if (tag)
				node->tag = tag;
			io_fixed_file_set(node, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}
	return done ? done : err;
}
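
/*
 * Same as the file update above, but for registered buffers: each source
 * iovec is validated, pinned and mapped before it replaces the node at the
 * target index.
 */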
static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec fast_iov, *iov;
	struct page *last_hpage = NULL;
	struct iovec __user *uvec;
	u64 user_data = up->data;
	__u32 done;
	int i, err;

	if (!ctx->buf_table.nr)
		return -ENXIO;
	if (up->offset + nr_args > ctx->buf_table.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		uvec = u64_to_user_ptr(user_data);
		iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
		if (IS_ERR(iov)) {
			err = PTR_ERR(iov);
			break;
		}
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(iov);
		if (err)
			break;
		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			err = PTR_ERR(node);
			break;
		}
		if (tag) {
			if (!node) {
				err = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
		io_reset_rsrc_node(ctx, &ctx->buf_table, i);
		ctx->buf_table.nodes[i] = node;
		if (ctx->compat)
			user_data += sizeof(struct compat_iovec);
		else
			user_data += sizeof(struct iovec);
	}
	return done ? done : err;
}

static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args)
{
	__u32 tmp;

	lockdep_assert_held(&ctx->uring_lock);

	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}

int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	return 0;
}

static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_table.data.nr)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}
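
/*
 * IORING_OP_FILES_UPDATE: update registered files from an SQE, either at the
 * offset given in the request or, for IORING_FILE_INDEX_ALLOC, by allocating
 * free slots and copying the chosen indexes back to userspace.
 */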
int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;
	int ret;

	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
						&up2, up->nr_args);
		io_ring_submit_unlock(ctx, issue_flags);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
	if (node->tag)
		io_post_aux_cqe(ctx, node->tag, 0, 0);

	switch (node->type) {
	case IORING_RSRC_FILE:
		fput(io_slot_file(node));
		break;
	case IORING_RSRC_BUFFER:
		io_buffer_unmap(ctx, node->buf);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}

	io_cache_free(&ctx->node_cache, node);
}

int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	if (!ctx->file_table.data.nr)
		return -ENXIO;

	io_free_file_tables(ctx, &ctx->file_table);
	io_file_table_set_alloc_range(ctx, 0, 0);
	return 0;
}
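
/*
 * Register an array of file descriptors with the ring. An fd of -1 (or a
 * NULL fd array) leaves the slot sparse, and io_uring files themselves are
 * rejected. On failure, everything registered so far is torn down again.
 */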
int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_table.data.nr)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args))
		return -ENOMEM;

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		ret = -EFAULT;
		if (tags && copy_from_user(&tag, &tags[i], sizeof(tag)))
			goto fail;
		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd)))
			goto fail;
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (tag)
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		ret = -ENOMEM;
		node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
		if (!node) {
			fput(file);
			goto fail;
		}
		if (tag)
			node->tag = tag;
		ctx->file_table.data.nodes[i] = node;
		io_fixed_file_set(node, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
	return 0;
fail:
	io_sqe_files_unregister(ctx);
	return ret;
}

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	if (!ctx->buf_table.nr)
		return -ENXIO;
	io_rsrc_data_free(ctx, &ctx->buf_table);
	return 0;
}

/*
 * Not super efficient, but this is just a registration time. And we do cache
 * the last compound head, so generally we'll only do a full search if we don't
 * match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->buf_table.nr; i++) {
		struct io_rsrc_node *node = ctx->buf_table.nodes[i];
		struct io_mapped_ubuf *imu;

		if (!node)
			continue;
		imu = node->buf;
		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}
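
/*
 * Figure out how many pages to charge for the pinned range and account them
 * via io_account_mem(). Compound (huge) pages are charged once at their full
 * size; the cached head page and the search above avoid double accounting.
 */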
static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}

static bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
				struct io_imu_folio_data *data)
{
	struct page **page_array = *pages, **new_array = NULL;
	int nr_pages_left = *nr_pages, i, j;
	int nr_folios = data->nr_folios;

	/* Store head pages only */
	new_array = kvmalloc_array(nr_folios, sizeof(struct page *),
				   GFP_KERNEL);
	if (!new_array)
		return false;

	new_array[0] = compound_head(page_array[0]);
	/*
	 * The pages are bound to the folio, it doesn't
	 * actually unpin them but drops all but one reference,
	 * which is usually put down by io_buffer_unmap().
	 * Note, needs a better helper.
	 */
	if (data->nr_pages_head > 1)
		unpin_user_pages(&page_array[1], data->nr_pages_head - 1);

	j = data->nr_pages_head;
	nr_pages_left -= data->nr_pages_head;
	for (i = 1; i < nr_folios; i++) {
		unsigned int nr_unpin;

		new_array[i] = page_array[j];
		nr_unpin = min_t(unsigned int, nr_pages_left - 1,
				 data->nr_pages_mid - 1);
		if (nr_unpin)
			unpin_user_pages(&page_array[j+1], nr_unpin);
		j += data->nr_pages_mid;
		nr_pages_left -= data->nr_pages_mid;
	}
	kvfree(page_array);
	*pages = new_array;
	*nr_pages = nr_folios;
	return true;
}

bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
			      struct io_imu_folio_data *data)
{
	struct folio *folio = page_folio(page_array[0]);
	unsigned int count = 1, nr_folios = 1;
	int i;

	data->nr_pages_mid = folio_nr_pages(folio);
	data->folio_shift = folio_shift(folio);

	/*
	 * Check if pages are contiguous inside a folio, and all folios have
	 * the same page count except for the head and tail.
	 */
	for (i = 1; i < nr_pages; i++) {
		if (page_folio(page_array[i]) == folio &&
		    page_array[i] == page_array[i-1] + 1) {
			count++;
			continue;
		}

		if (nr_folios == 1) {
			if (folio_page_idx(folio, page_array[i-1]) !=
			    data->nr_pages_mid - 1)
				return false;

			data->nr_pages_head = count;
		} else if (count != data->nr_pages_mid) {
			return false;
		}

		folio = page_folio(page_array[i]);
		if (folio_size(folio) != (1UL << data->folio_shift) ||
		    folio_page_idx(folio, page_array[i]) != 0)
			return false;

		count = 1;
		nr_folios++;
	}
	if (nr_folios == 1)
		data->nr_pages_head = count;

	data->nr_folios = nr_folios;
	return true;
}
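
/*
 * Pin the pages backing one user iovec and build the io_mapped_ubuf that
 * describes it. Hugepage-backed ranges are coalesced into per-folio bvec
 * entries where possible. A NULL iov_base produces a NULL node, i.e. a
 * sparse slot.
 */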
static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
						   struct iovec *iov,
						   struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	struct io_rsrc_node *node;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;
	struct io_imu_folio_data data;
	bool coalesced = false;

	if (!iov->iov_base)
		return NULL;

	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
	if (!node)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
			     &nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	/* If it's huge page(s), try to coalesce them into fewer bvec entries */
	if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) {
		if (data.nr_pages_mid != 1)
			coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
	}

	imu = io_alloc_imu(ctx, nr_pages);
	if (!imu)
		goto done;

	imu->nr_bvecs = nr_pages;
	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret) {
		unpin_user_pages(pages, nr_pages);
		goto done;
	}

	size = iov->iov_len;
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->len = iov->iov_len;
	imu->folio_shift = PAGE_SHIFT;
	imu->release = io_release_ubuf;
	imu->priv = imu;
	imu->is_kbuf = false;
	imu->dir = IO_IMU_DEST | IO_IMU_SOURCE;
	if (coalesced)
		imu->folio_shift = data.folio_shift;
	refcount_set(&imu->refs, 1);
	off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1);
	node->buf = imu;
	ret = 0;

	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
		off = 0;
		size -= vec_len;
	}
done:
	if (ret) {
		if (imu)
			io_free_imu(ctx, imu);
		io_cache_free(&ctx->node_cache, node);
		node = ERR_PTR(ret);
	}
	kvfree(pages);
	return node;
}

int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data data;
	struct iovec fast_iov, *iov = &fast_iov;
	const struct iovec __user *uvec;
	int i, ret;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->buf_table.nr)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_data_alloc(&data, nr_args);
	if (ret)
		return ret;

	if (!arg)
		memset(iov, 0, sizeof(*iov));

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		if (arg) {
			uvec = (struct iovec __user *) arg;
			iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
			if (IS_ERR(iov)) {
				ret = PTR_ERR(iov);
				break;
			}
			ret = io_buffer_validate(iov);
			if (ret)
				break;
			if (ctx->compat)
				arg += sizeof(struct compat_iovec);
			else
				arg += sizeof(struct iovec);
		}

		if (tags) {
			if (copy_from_user(&tag, &tags[i], sizeof(tag))) {
				ret = -EFAULT;
				break;
			}
		}

		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			ret = PTR_ERR(node);
			break;
		}
		if (tag) {
			if (!node) {
				ret = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		data.nodes[i] = node;
	}

	ctx->buf_table = data;
	if (ret)
		io_sqe_buffers_unregister(ctx);
	return ret;
}
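
/*
 * Register the bio_vecs of a request as a fixed kernel buffer at the given
 * buffer table index, for use by io_uring command providers. The caller's
 * release callback is invoked once the node is dropped.
 */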
int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
			    void (*release)(void *), unsigned int index,
			    unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *data = &ctx->buf_table;
	struct req_iterator rq_iter;
	struct io_mapped_ubuf *imu;
	struct io_rsrc_node *node;
	struct bio_vec bv, *bvec;
	u16 nr_bvecs;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	if (index >= data->nr) {
		ret = -EINVAL;
		goto unlock;
	}
	index = array_index_nospec(index, data->nr);

	if (data->nodes[index]) {
		ret = -EBUSY;
		goto unlock;
	}

	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
	if (!node) {
		ret = -ENOMEM;
		goto unlock;
	}

	nr_bvecs = blk_rq_nr_phys_segments(rq);
	imu = io_alloc_imu(ctx, nr_bvecs);
	if (!imu) {
		kfree(node);
		ret = -ENOMEM;
		goto unlock;
	}

	imu->ubuf = 0;
	imu->len = blk_rq_bytes(rq);
	imu->acct_pages = 0;
	imu->folio_shift = PAGE_SHIFT;
	imu->nr_bvecs = nr_bvecs;
	refcount_set(&imu->refs, 1);
	imu->release = release;
	imu->priv = rq;
	imu->is_kbuf = true;
	imu->dir = 1 << rq_data_dir(rq);

	bvec = imu->bvec;
	rq_for_each_bvec(bv, rq, rq_iter)
		*bvec++ = bv;

	node->buf = imu;
	data->nodes[index] = node;
unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_register_bvec);

int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
			      unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *data = &ctx->buf_table;
	struct io_rsrc_node *node;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	if (index >= data->nr) {
		ret = -EINVAL;
		goto unlock;
	}
	index = array_index_nospec(index, data->nr);

	node = data->nodes[index];
	if (!node) {
		ret = -EINVAL;
		goto unlock;
	}
	if (!node->buf->is_kbuf) {
		ret = -EBUSY;
		goto unlock;
	}

	io_put_rsrc_node(ctx, node);
	data->nodes[index] = NULL;
unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec);
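
/*
 * Check that [buf_addr, buf_addr + len) sits fully inside the registered
 * buffer and that the length does not exceed MAX_RW_COUNT.
 */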
static int validate_fixed_range(u64 buf_addr, size_t len,
				const struct io_mapped_ubuf *imu)
{
	u64 buf_end;

	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
		return -EFAULT;
	if (unlikely(len > MAX_RW_COUNT))
		return -EFAULT;
	return 0;
}

static int io_import_fixed(int ddir, struct iov_iter *iter,
			   struct io_mapped_ubuf *imu,
			   u64 buf_addr, size_t len)
{
	size_t offset;
	int ret;

	if (WARN_ON_ONCE(!imu))
		return -EFAULT;
	ret = validate_fixed_range(buf_addr, len, imu);
	if (unlikely(ret))
		return ret;
	if (!(imu->dir & (1 << ddir)))
		return -EFAULT;

	/*
	 * Might not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);

	if (offset) {
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here for user
		 * registered nodes, because we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are the same in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or the whole first
		 * bvec), just use iov_iter_advance(). This makes it easier
		 * since we can just skip the first segment, which may not
		 * be folio_size aligned.
		 */
		const struct bio_vec *bvec = imu->bvec;

		/*
		 * Kernel buffer bvecs, on the other hand, don't necessarily
		 * have the size property of user registered ones, so we have
		 * to use the slow iter advance.
		 */
		if (offset < bvec->bv_len) {
			iter->count -= offset;
			iter->iov_offset = offset;
		} else if (imu->is_kbuf) {
			iov_iter_advance(iter, offset);
		} else {
			unsigned long seg_skip;

			/* skip first vec */
			offset -= bvec->bv_len;
			seg_skip = 1 + (offset >> imu->folio_shift);

			iter->bvec += seg_skip;
			iter->nr_segs -= seg_skip;
			iter->count -= bvec->bv_len + offset;
			iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1);
		}
	}

	return 0;
}
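
/*
 * Look up the registered buffer node for req->buf_index, caching the result
 * in the request so subsequent lookups are free. Returns NULL if no buffer
 * is registered at that index.
 */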
inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req,
					     unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_rsrc_node *node;

	if (req->flags & REQ_F_BUF_NODE)
		return req->buf_node;

	io_ring_submit_lock(ctx, issue_flags);
	node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
	if (node)
		io_req_assign_buf_node(req, node);
	io_ring_submit_unlock(ctx, issue_flags);
	return node;
}

int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,
		      u64 buf_addr, size_t len, int ddir,
		      unsigned issue_flags)
{
	struct io_rsrc_node *node;

	node = io_find_buf_node(req, issue_flags);
	if (!node)
		return -EFAULT;
	return io_import_fixed(ddir, iter, node->buf, buf_addr, len);
}

/* Lock two rings at once. The rings must be different! */
static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
{
	if (ctx1 > ctx2)
		swap(ctx1, ctx2);
	mutex_lock(&ctx1->uring_lock);
	mutex_lock_nested(&ctx2->uring_lock, SINGLE_DEPTH_NESTING);
}

/* Both rings are locked by the caller. */
static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
			    struct io_uring_clone_buffers *arg)
{
	struct io_rsrc_data data;
	int i, ret, off, nr;
	unsigned int nbufs;

	lockdep_assert_held(&ctx->uring_lock);
	lockdep_assert_held(&src_ctx->uring_lock);

	/*
	 * Accounting state is shared between the two rings; that only works if
	 * both rings are accounted towards the same counters.
	 */
	if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account)
		return -EINVAL;

	/* if offsets are given, must have nr specified too */
	if (!arg->nr && (arg->dst_off || arg->src_off))
		return -EINVAL;
	/* not allowed unless REPLACE is set */
	if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
		return -EBUSY;

	nbufs = src_ctx->buf_table.nr;
	if (!arg->nr)
		arg->nr = nbufs;
	else if (arg->nr > nbufs)
		return -EINVAL;
	else if (arg->nr > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
		return -EOVERFLOW;

	ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr));
	if (ret)
		return ret;

	/* Fill entries in data from dst that won't overlap with src */
	for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
		struct io_rsrc_node *src_node = ctx->buf_table.nodes[i];

		if (src_node) {
			data.nodes[i] = src_node;
			src_node->refs++;
		}
	}

	ret = -ENXIO;
	nbufs = src_ctx->buf_table.nr;
	if (!nbufs)
		goto out_free;
	ret = -EINVAL;
	if (!arg->nr)
		arg->nr = nbufs;
	else if (arg->nr > nbufs)
		goto out_free;
	ret = -EOVERFLOW;
	if (check_add_overflow(arg->nr, arg->src_off, &off))
		goto out_free;
	if (off > nbufs)
		goto out_free;

	off = arg->dst_off;
	i = arg->src_off;
	nr = arg->nr;
	while (nr--) {
		struct io_rsrc_node *dst_node, *src_node;

		src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i);
		if (!src_node) {
			dst_node = NULL;
		} else {
			dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
			if (!dst_node) {
				ret = -ENOMEM;
				goto out_free;
			}

			refcount_inc(&src_node->buf->refs);
			dst_node->buf = src_node->buf;
		}
		data.nodes[off++] = dst_node;
		i++;
	}

	/*
	 * If asked for replace, put the old table. data->nodes[] holds both
	 * old and new nodes at this point.
	 */
	if (arg->flags & IORING_REGISTER_DST_REPLACE)
		io_rsrc_data_free(ctx, &ctx->buf_table);

	/*
	 * ctx->buf_table must be empty now - either the contents are being
	 * replaced and we just freed the table, or the contents are being
	 * copied to a ring that does not have buffers yet (checked at function
	 * entry).
	 */
	WARN_ON_ONCE(ctx->buf_table.nr);
	ctx->buf_table = data;
	return 0;

out_free:
	io_rsrc_data_free(ctx, &data);
	return ret;
}

/*
 * Copy the registered buffers from the source ring whose file descriptor
 * is given in the src_fd to the current ring. This is identical to registering
 * the buffers with ctx, except faster as mappings already exist.
 *
 * Since the memory is already accounted once, don't account it again.
 */
int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_clone_buffers buf;
	struct io_ring_ctx *src_ctx;
	bool registered_src;
	struct file *file;
	int ret;

	if (copy_from_user(&buf, arg, sizeof(buf)))
		return -EFAULT;
	if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE))
		return -EINVAL;
	if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr)
		return -EBUSY;
	if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
		return -EINVAL;

	registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
	file = io_uring_register_get_file(buf.src_fd, registered_src);
	if (IS_ERR(file))
		return PTR_ERR(file);

	src_ctx = file->private_data;
	if (src_ctx != ctx) {
		mutex_unlock(&ctx->uring_lock);
		lock_two_rings(ctx, src_ctx);
	}

	ret = io_clone_buffers(ctx, src_ctx, &buf);

	if (src_ctx != ctx)
		mutex_unlock(&src_ctx->uring_lock);

	fput(file);
	return ret;
}

void io_vec_free(struct iou_vec *iv)
{
	if (!iv->iovec)
		return;
	kfree(iv->iovec);
	iv->iovec = NULL;
	iv->nr = 0;
}

int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries)
{
	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
	struct iovec *iov;

	iov = kmalloc_array(nr_entries, sizeof(iov[0]), gfp);
	if (!iov)
		return -ENOMEM;

	io_vec_free(iv);
	iv->iovec = iov;
	iv->nr = nr_entries;
	return 0;
}
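
/*
 * Translate user iovecs that point into a registered (user) buffer into the
 * bvec array of @vec, splitting each iovec on folio boundaries of the
 * underlying mapping.
 */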
static int io_vec_fill_bvec(int ddir, struct iov_iter *iter,
			    struct io_mapped_ubuf *imu,
			    struct iovec *iovec, unsigned nr_iovs,
			    struct iou_vec *vec)
{
	unsigned long folio_size = 1 << imu->folio_shift;
	unsigned long folio_mask = folio_size - 1;
	u64 folio_addr = imu->ubuf & ~folio_mask;
	struct bio_vec *res_bvec = vec->bvec;
	size_t total_len = 0;
	unsigned bvec_idx = 0;
	unsigned iov_idx;

	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
		size_t iov_len = iovec[iov_idx].iov_len;
		u64 buf_addr = (u64)(uintptr_t)iovec[iov_idx].iov_base;
		struct bio_vec *src_bvec;
		size_t offset;
		int ret;

		ret = validate_fixed_range(buf_addr, iov_len, imu);
		if (unlikely(ret))
			return ret;

		if (unlikely(!iov_len))
			return -EFAULT;
		if (unlikely(check_add_overflow(total_len, iov_len, &total_len)))
			return -EOVERFLOW;

		/* by using folio address it also accounts for bvec offset */
		offset = buf_addr - folio_addr;
		src_bvec = imu->bvec + (offset >> imu->folio_shift);
		offset &= folio_mask;

		for (; iov_len; offset = 0, bvec_idx++, src_bvec++) {
			size_t seg_size = min_t(size_t, iov_len,
						folio_size - offset);

			bvec_set_page(&res_bvec[bvec_idx],
				      src_bvec->bv_page, seg_size, offset);
			iov_len -= seg_size;
		}
	}
	if (total_len > MAX_RW_COUNT)
		return -EINVAL;

	iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len);
	return 0;
}

static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs,
				 struct io_mapped_ubuf *imu)
{
	unsigned shift = imu->folio_shift;
	size_t max_segs = 0;
	unsigned i;

	for (i = 0; i < nr_iovs; i++)
		max_segs += (iov[i].iov_len >> shift) + 2;
	return max_segs;
}
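
/*
 * Same as above, but for kernel-registered (is_kbuf) buffers, where iov_base
 * carries an offset into the registered buffer rather than a user address
 * and the source bvecs may have arbitrary sizes.
 */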
static int io_vec_fill_kern_bvec(int ddir, struct iov_iter *iter,
				 struct io_mapped_ubuf *imu,
				 struct iovec *iovec, unsigned nr_iovs,
				 struct iou_vec *vec)
{
	const struct bio_vec *src_bvec = imu->bvec;
	struct bio_vec *res_bvec = vec->bvec;
	unsigned res_idx = 0;
	size_t total_len = 0;
	unsigned iov_idx;

	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
		size_t offset = (size_t)(uintptr_t)iovec[iov_idx].iov_base;
		size_t iov_len = iovec[iov_idx].iov_len;
		struct bvec_iter bi = {
			.bi_size	= offset + iov_len,
		};
		struct bio_vec bv;

		bvec_iter_advance(src_bvec, &bi, offset);
		for_each_mp_bvec(bv, src_bvec, bi, bi)
			res_bvec[res_idx++] = bv;
		total_len += iov_len;
	}
	iov_iter_bvec(iter, ddir, res_bvec, res_idx, total_len);
	return 0;
}

static int iov_kern_bvec_size(const struct iovec *iov,
			      const struct io_mapped_ubuf *imu,
			      unsigned int *nr_seg)
{
	size_t offset = (size_t)(uintptr_t)iov->iov_base;
	const struct bio_vec *bvec = imu->bvec;
	int start = 0, i = 0;
	size_t off = 0;
	int ret;

	ret = validate_fixed_range(offset, iov->iov_len, imu);
	if (unlikely(ret))
		return ret;

	for (i = 0; off < offset + iov->iov_len && i < imu->nr_bvecs;
			off += bvec[i].bv_len, i++) {
		if (offset >= off && offset < off + bvec[i].bv_len)
			start = i;
	}
	*nr_seg = i - start;
	return 0;
}

static int io_kern_bvec_size(struct iovec *iov, unsigned nr_iovs,
			     struct io_mapped_ubuf *imu, unsigned *nr_segs)
{
	unsigned max_segs = 0;
	size_t total_len = 0;
	unsigned i;
	int ret;

	*nr_segs = 0;
	for (i = 0; i < nr_iovs; i++) {
		if (unlikely(!iov[i].iov_len))
			return -EFAULT;
		if (unlikely(check_add_overflow(total_len, iov[i].iov_len,
						&total_len)))
			return -EOVERFLOW;
		ret = iov_kern_bvec_size(&iov[i], imu, &max_segs);
		if (unlikely(ret))
			return ret;
		*nr_segs += max_segs;
	}
	if (total_len > MAX_RW_COUNT)
		return -EINVAL;
	return 0;
}
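
/*
 * Import a vectored fixed-buffer request: resolve the buffer node, size the
 * bvec table (reallocating the iou_vec if the cached one is too small), and
 * fill it via the user or kernel buffer path above.
 */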
int io_import_reg_vec(int ddir, struct iov_iter *iter,
		      struct io_kiocb *req, struct iou_vec *vec,
		      unsigned nr_iovs, unsigned issue_flags)
{
	struct io_rsrc_node *node;
	struct io_mapped_ubuf *imu;
	unsigned iovec_off;
	struct iovec *iov;
	unsigned nr_segs;

	node = io_find_buf_node(req, issue_flags);
	if (!node)
		return -EFAULT;
	imu = node->buf;
	if (!(imu->dir & (1 << ddir)))
		return -EFAULT;

	iovec_off = vec->nr - nr_iovs;
	iov = vec->iovec + iovec_off;

	if (imu->is_kbuf) {
		int ret = io_kern_bvec_size(iov, nr_iovs, imu, &nr_segs);

		if (unlikely(ret))
			return ret;
	} else {
		nr_segs = io_estimate_bvec_size(iov, nr_iovs, imu);
	}

	if (sizeof(struct bio_vec) > sizeof(struct iovec)) {
		size_t bvec_bytes;

		bvec_bytes = nr_segs * sizeof(struct bio_vec);
		nr_segs = (bvec_bytes + sizeof(*iov) - 1) / sizeof(*iov);
		nr_segs += nr_iovs;
	}

	if (nr_segs > vec->nr) {
		struct iou_vec tmp_vec = {};
		int ret;

		ret = io_vec_realloc(&tmp_vec, nr_segs);
		if (ret)
			return ret;

		iovec_off = tmp_vec.nr - nr_iovs;
		memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs);
		io_vec_free(vec);

		*vec = tmp_vec;
		iov = vec->iovec + iovec_off;
		req->flags |= REQ_F_NEED_CLEANUP;
	}

	if (imu->is_kbuf)
		return io_vec_fill_kern_bvec(ddir, iter, imu, iov, nr_iovs, vec);

	return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec);
}

int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv,
		      const struct iovec __user *uvec, size_t uvec_segs)
{
	struct iovec *iov;
	int iovec_off, ret;
	void *res;

	if (uvec_segs > iv->nr) {
		ret = io_vec_realloc(iv, uvec_segs);
		if (ret)
			return ret;
		req->flags |= REQ_F_NEED_CLEANUP;
	}

	/* pad iovec to the right */
	iovec_off = iv->nr - uvec_segs;
	iov = iv->iovec + iovec_off;
	res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov,
			      io_is_compat(req->ctx));
	if (IS_ERR(res))
		return PTR_ERR(res);

	req->flags |= REQ_F_IMPORT_BUFFER;
	return 0;
}