1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/kernel.h> 3 #include <linux/errno.h> 4 #include <linux/fs.h> 5 #include <linux/file.h> 6 #include <linux/mm.h> 7 #include <linux/slab.h> 8 #include <linux/nospec.h> 9 #include <linux/hugetlb.h> 10 #include <linux/compat.h> 11 #include <linux/io_uring.h> 12 #include <linux/io_uring/cmd.h> 13 14 #include <uapi/linux/io_uring.h> 15 16 #include "filetable.h" 17 #include "io_uring.h" 18 #include "openclose.h" 19 #include "rsrc.h" 20 #include "memmap.h" 21 #include "register.h" 22 23 struct io_rsrc_update { 24 struct file *file; 25 u64 arg; 26 u32 nr_args; 27 u32 offset; 28 }; 29 30 static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, 31 struct iovec *iov); 32 33 static int hpage_acct_ref(struct io_ring_ctx *ctx, struct page *hpage, 34 bool *acct_new) 35 { 36 unsigned long key = (unsigned long) hpage; 37 unsigned long count; 38 void *entry; 39 int ret; 40 41 lockdep_assert_held(&ctx->uring_lock); 42 43 entry = xa_load(&ctx->hpage_acct, key); 44 if (entry) { 45 *acct_new = false; 46 count = xa_to_value(entry) + 1; 47 } else { 48 ret = xa_reserve(&ctx->hpage_acct, key, GFP_KERNEL_ACCOUNT); 49 if (ret) 50 return ret; 51 *acct_new = true; 52 count = 1; 53 } 54 xa_store(&ctx->hpage_acct, key, xa_mk_value(count), GFP_KERNEL_ACCOUNT); 55 return 0; 56 } 57 58 static bool hpage_acct_unref(struct io_ring_ctx *ctx, struct page *hpage) 59 { 60 unsigned long key = (unsigned long) hpage; 61 unsigned long count; 62 void *entry; 63 64 lockdep_assert_held(&ctx->uring_lock); 65 66 entry = xa_load(&ctx->hpage_acct, key); 67 if (WARN_ON_ONCE(!entry)) 68 return false; 69 count = xa_to_value(entry); 70 if (count == 1) { 71 xa_erase(&ctx->hpage_acct, key); 72 return true; 73 } 74 xa_store(&ctx->hpage_acct, key, xa_mk_value(count - 1), GFP_KERNEL_ACCOUNT); 75 return false; 76 } 77 78 /* only define max */ 79 #define IORING_MAX_FIXED_FILES (1U << 20) 80 #define IORING_MAX_REG_BUFFERS (1U << 14) 81 82 #define IO_CACHED_BVECS_SEGS 32 83 84 int __io_account_mem(struct user_struct *user, unsigned long nr_pages) 85 { 86 unsigned long page_limit, cur_pages, new_pages; 87 88 if (!nr_pages) 89 return 0; 90 91 /* Don't allow more pages than we can safely lock */ 92 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; 93 94 cur_pages = atomic_long_read(&user->locked_vm); 95 do { 96 new_pages = cur_pages + nr_pages; 97 if (new_pages > page_limit) 98 return -ENOMEM; 99 } while (!atomic_long_try_cmpxchg(&user->locked_vm, 100 &cur_pages, new_pages)); 101 return 0; 102 } 103 104 void io_unaccount_mem(struct user_struct *user, struct mm_struct *mm_account, 105 unsigned long nr_pages) 106 { 107 if (user) 108 __io_unaccount_mem(user, nr_pages); 109 110 if (mm_account) 111 atomic64_sub(nr_pages, &mm_account->pinned_vm); 112 } 113 114 int io_account_mem(struct user_struct *user, struct mm_struct *mm_account, 115 unsigned long nr_pages) 116 { 117 int ret; 118 119 if (user) { 120 ret = __io_account_mem(user, nr_pages); 121 if (ret) 122 return ret; 123 } 124 125 if (mm_account) 126 atomic64_add(nr_pages, &mm_account->pinned_vm); 127 128 return 0; 129 } 130 131 int io_validate_user_buf_range(u64 uaddr, u64 ulen) 132 { 133 unsigned long tmp, base = (unsigned long)uaddr; 134 unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen); 135 136 if (!ulen) 137 return -EFAULT; 138 /* 32-bit sanity checking */ 139 if (ulen > ULONG_MAX || uaddr > ULONG_MAX) 140 return -EFAULT; 141 /* cap to 1TB for 64-bit */ 142 if (ulen > SZ_1T) 143 return -EINVAL; 144 if (check_add_overflow(base, acct_len, &tmp)) 145 return -EOVERFLOW; 146 return 0; 147 } 148 149 static void io_release_ubuf(void *priv) 150 { 151 struct io_mapped_ubuf *imu = priv; 152 unsigned int i; 153 154 for (i = 0; i < imu->nr_bvecs; i++) { 155 struct folio *folio = bvec_folio(&imu->bvec[i]); 156 157 unpin_user_folio(folio, 1); 158 } 159 } 160 161 static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx, 162 int nr_bvecs) 163 { 164 if (nr_bvecs <= IO_CACHED_BVECS_SEGS) 165 return io_cache_alloc(&ctx->imu_cache, GFP_KERNEL); 166 return kvmalloc_flex(struct io_mapped_ubuf, bvec, nr_bvecs); 167 } 168 169 static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) 170 { 171 if (imu->nr_bvecs <= IO_CACHED_BVECS_SEGS) 172 io_cache_free(&ctx->imu_cache, imu); 173 else 174 kvfree(imu); 175 } 176 177 static unsigned long io_buffer_unaccount_pages(struct io_ring_ctx *ctx, 178 struct io_mapped_ubuf *imu) 179 { 180 struct page *seen = NULL; 181 unsigned long acct = 0; 182 int i; 183 184 if (imu->flags & IO_REGBUF_F_KBUF || !ctx->user) 185 return 0; 186 187 for (i = 0; i < imu->nr_bvecs; i++) { 188 struct page *page = imu->bvec[i].bv_page; 189 struct page *hpage; 190 191 if (!PageCompound(page)) { 192 acct++; 193 continue; 194 } 195 196 hpage = compound_head(page); 197 if (hpage == seen) 198 continue; 199 seen = hpage; 200 201 /* Unaccount on last reference */ 202 if (hpage_acct_unref(ctx, hpage)) 203 acct += page_size(hpage) >> PAGE_SHIFT; 204 cond_resched(); 205 } 206 207 return acct; 208 } 209 210 static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) 211 { 212 unsigned long acct_pages = 0; 213 214 /* Always decrement, so it works for cloned buffers too */ 215 acct_pages = io_buffer_unaccount_pages(ctx, imu); 216 217 if (unlikely(refcount_read(&imu->refs) > 1)) { 218 if (!refcount_dec_and_test(&imu->refs)) 219 return; 220 } 221 222 if (acct_pages) 223 io_unaccount_mem(ctx->user, ctx->mm_account, acct_pages); 224 imu->release(imu->priv); 225 io_free_imu(ctx, imu); 226 } 227 228 struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type) 229 { 230 struct io_rsrc_node *node; 231 232 node = io_cache_alloc(&ctx->node_cache, GFP_KERNEL); 233 if (node) { 234 node->type = type; 235 node->refs = 1; 236 node->tag = 0; 237 node->file_ptr = 0; 238 } 239 return node; 240 } 241 242 bool io_rsrc_cache_init(struct io_ring_ctx *ctx) 243 { 244 const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec, 245 IO_CACHED_BVECS_SEGS); 246 const int node_size = sizeof(struct io_rsrc_node); 247 bool ret; 248 249 ret = io_alloc_cache_init(&ctx->node_cache, IO_ALLOC_CACHE_MAX, 250 node_size, 0); 251 ret |= io_alloc_cache_init(&ctx->imu_cache, IO_ALLOC_CACHE_MAX, 252 imu_cache_size, 0); 253 return ret; 254 } 255 256 void io_rsrc_cache_free(struct io_ring_ctx *ctx) 257 { 258 io_alloc_cache_free(&ctx->node_cache, kfree); 259 io_alloc_cache_free(&ctx->imu_cache, kvfree); 260 } 261 262 static void io_clear_table_tags(struct io_rsrc_data *data) 263 { 264 int i; 265 266 for (i = 0; i < data->nr; i++) { 267 struct io_rsrc_node *node = data->nodes[i]; 268 269 if (node) 270 node->tag = 0; 271 } 272 } 273 274 __cold void io_rsrc_data_free(struct io_ring_ctx *ctx, 275 struct io_rsrc_data *data) 276 { 277 if (!data->nr) 278 return; 279 while (data->nr--) { 280 if (data->nodes[data->nr]) 281 io_put_rsrc_node(ctx, data->nodes[data->nr]); 282 } 283 kvfree(data->nodes); 284 data->nodes = NULL; 285 data->nr = 0; 286 } 287 288 __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr) 289 { 290 data->nodes = kvmalloc_objs(struct io_rsrc_node *, nr, 291 GFP_KERNEL_ACCOUNT | __GFP_ZERO); 292 if (data->nodes) { 293 data->nr = nr; 294 return 0; 295 } 296 return -ENOMEM; 297 } 298 299 static int __io_sqe_files_update(struct io_ring_ctx *ctx, 300 struct io_uring_rsrc_update2 *up, 301 unsigned nr_args) 302 { 303 u64 __user *tags = u64_to_user_ptr(up->tags); 304 __s32 __user *fds = u64_to_user_ptr(up->data); 305 int fd, i, err = 0; 306 unsigned int done; 307 308 if (!ctx->file_table.data.nr) 309 return -ENXIO; 310 if (up->offset + nr_args > ctx->file_table.data.nr) 311 return -EINVAL; 312 313 for (done = 0; done < nr_args; done++) { 314 u64 tag = 0; 315 316 if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) || 317 copy_from_user(&fd, &fds[done], sizeof(fd))) { 318 err = -EFAULT; 319 break; 320 } 321 if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) { 322 err = -EINVAL; 323 break; 324 } 325 if (fd == IORING_REGISTER_FILES_SKIP) 326 continue; 327 328 i = up->offset + done; 329 if (i >= ctx->file_table.data.nr) 330 break; 331 i = array_index_nospec(i, ctx->file_table.data.nr); 332 if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i)) 333 io_file_bitmap_clear(&ctx->file_table, i); 334 335 if (fd != -1) { 336 struct file *file = fget(fd); 337 struct io_rsrc_node *node; 338 339 if (!file) { 340 err = -EBADF; 341 break; 342 } 343 /* 344 * Don't allow io_uring instances to be registered. 345 */ 346 if (io_is_uring_fops(file)) { 347 fput(file); 348 err = -EBADF; 349 break; 350 } 351 node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); 352 if (!node) { 353 err = -ENOMEM; 354 fput(file); 355 break; 356 } 357 ctx->file_table.data.nodes[i] = node; 358 if (tag) 359 node->tag = tag; 360 io_fixed_file_set(node, file); 361 io_file_bitmap_set(&ctx->file_table, i); 362 } 363 } 364 return done ? done : err; 365 } 366 367 static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, 368 struct io_uring_rsrc_update2 *up, 369 unsigned int nr_args) 370 { 371 u64 __user *tags = u64_to_user_ptr(up->tags); 372 struct iovec fast_iov, *iov; 373 struct iovec __user *uvec; 374 u64 user_data = up->data; 375 __u32 done; 376 int i, err; 377 378 if (!ctx->buf_table.nr) 379 return -ENXIO; 380 if (up->offset + nr_args > ctx->buf_table.nr) 381 return -EINVAL; 382 383 for (done = 0; done < nr_args; done++) { 384 struct io_rsrc_node *node; 385 u64 tag = 0; 386 387 uvec = u64_to_user_ptr(user_data); 388 iov = iovec_from_user(uvec, 1, 1, &fast_iov, io_is_compat(ctx)); 389 if (IS_ERR(iov)) { 390 err = PTR_ERR(iov); 391 break; 392 } 393 if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) { 394 err = -EFAULT; 395 break; 396 } 397 node = io_sqe_buffer_register(ctx, iov); 398 if (IS_ERR(node)) { 399 err = PTR_ERR(node); 400 break; 401 } 402 if (tag) { 403 if (!node) { 404 err = -EINVAL; 405 break; 406 } 407 node->tag = tag; 408 } 409 i = array_index_nospec(up->offset + done, ctx->buf_table.nr); 410 io_reset_rsrc_node(ctx, &ctx->buf_table, i); 411 ctx->buf_table.nodes[i] = node; 412 if (io_is_compat(ctx)) 413 user_data += sizeof(struct compat_iovec); 414 else 415 user_data += sizeof(struct iovec); 416 } 417 return done ? done : err; 418 } 419 420 static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, 421 struct io_uring_rsrc_update2 *up, 422 unsigned nr_args) 423 { 424 __u32 tmp; 425 426 lockdep_assert_held(&ctx->uring_lock); 427 428 if (check_add_overflow(up->offset, nr_args, &tmp)) 429 return -EOVERFLOW; 430 431 switch (type) { 432 case IORING_RSRC_FILE: 433 return __io_sqe_files_update(ctx, up, nr_args); 434 case IORING_RSRC_BUFFER: 435 return __io_sqe_buffers_update(ctx, up, nr_args); 436 } 437 return -EINVAL; 438 } 439 440 int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, 441 unsigned nr_args) 442 { 443 struct io_uring_rsrc_update2 up; 444 445 if (!nr_args) 446 return -EINVAL; 447 memset(&up, 0, sizeof(up)); 448 if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update))) 449 return -EFAULT; 450 if (up.resv || up.resv2) 451 return -EINVAL; 452 return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args); 453 } 454 455 int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, 456 unsigned size, unsigned type) 457 { 458 struct io_uring_rsrc_update2 up; 459 460 if (size != sizeof(up)) 461 return -EINVAL; 462 if (copy_from_user(&up, arg, sizeof(up))) 463 return -EFAULT; 464 if (!up.nr || up.resv || up.resv2) 465 return -EINVAL; 466 return __io_register_rsrc_update(ctx, type, &up, up.nr); 467 } 468 469 __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, 470 unsigned int size, unsigned int type) 471 { 472 struct io_uring_rsrc_register rr; 473 474 /* keep it extendible */ 475 if (size != sizeof(rr)) 476 return -EINVAL; 477 478 memset(&rr, 0, sizeof(rr)); 479 if (copy_from_user(&rr, arg, size)) 480 return -EFAULT; 481 if (!rr.nr || rr.resv2) 482 return -EINVAL; 483 if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE) 484 return -EINVAL; 485 486 switch (type) { 487 case IORING_RSRC_FILE: 488 if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) 489 break; 490 return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data), 491 rr.nr, u64_to_user_ptr(rr.tags)); 492 case IORING_RSRC_BUFFER: 493 if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) 494 break; 495 return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data), 496 rr.nr, u64_to_user_ptr(rr.tags)); 497 } 498 return -EINVAL; 499 } 500 501 int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 502 { 503 struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update); 504 505 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) 506 return -EINVAL; 507 if (sqe->rw_flags || sqe->splice_fd_in) 508 return -EINVAL; 509 510 up->offset = READ_ONCE(sqe->off); 511 up->nr_args = READ_ONCE(sqe->len); 512 if (!up->nr_args) 513 return -EINVAL; 514 up->arg = READ_ONCE(sqe->addr); 515 return 0; 516 } 517 518 static int io_files_update_with_index_alloc(struct io_kiocb *req, 519 unsigned int issue_flags) 520 { 521 struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update); 522 __s32 __user *fds = u64_to_user_ptr(up->arg); 523 unsigned int done; 524 struct file *file; 525 int ret, fd; 526 527 if (!req->ctx->file_table.data.nr) 528 return -ENXIO; 529 530 for (done = 0; done < up->nr_args; done++) { 531 if (get_user(fd, &fds[done])) { 532 ret = -EFAULT; 533 break; 534 } 535 536 file = fget(fd); 537 if (!file) { 538 ret = -EBADF; 539 break; 540 } 541 ret = io_fixed_fd_install(req, issue_flags, file, 542 IORING_FILE_INDEX_ALLOC); 543 if (ret < 0) 544 break; 545 if (put_user(ret, &fds[done])) { 546 __io_close_fixed(req->ctx, issue_flags, ret); 547 ret = -EFAULT; 548 break; 549 } 550 } 551 552 if (done) 553 return done; 554 return ret; 555 } 556 557 int io_files_update(struct io_kiocb *req, unsigned int issue_flags) 558 { 559 struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update); 560 struct io_ring_ctx *ctx = req->ctx; 561 struct io_uring_rsrc_update2 up2; 562 int ret; 563 564 up2.offset = up->offset; 565 up2.data = up->arg; 566 up2.nr = 0; 567 up2.tags = 0; 568 up2.resv = 0; 569 up2.resv2 = 0; 570 571 if (up->offset == IORING_FILE_INDEX_ALLOC) { 572 ret = io_files_update_with_index_alloc(req, issue_flags); 573 } else { 574 io_ring_submit_lock(ctx, issue_flags); 575 ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, 576 &up2, up->nr_args); 577 io_ring_submit_unlock(ctx, issue_flags); 578 } 579 580 if (ret < 0) 581 req_set_fail(req); 582 io_req_set_res(req, ret, 0); 583 return IOU_COMPLETE; 584 } 585 586 void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) 587 { 588 if (node->tag) 589 io_post_aux_cqe(ctx, node->tag, 0, 0); 590 591 switch (node->type) { 592 case IORING_RSRC_FILE: 593 fput(io_slot_file(node)); 594 break; 595 case IORING_RSRC_BUFFER: 596 io_buffer_unmap(ctx, node->buf); 597 break; 598 default: 599 WARN_ON_ONCE(1); 600 break; 601 } 602 603 io_cache_free(&ctx->node_cache, node); 604 } 605 606 int io_sqe_files_unregister(struct io_ring_ctx *ctx) 607 { 608 if (!ctx->file_table.data.nr) 609 return -ENXIO; 610 611 io_free_file_tables(ctx, &ctx->file_table); 612 io_file_table_set_alloc_range(ctx, 0, 0); 613 return 0; 614 } 615 616 int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, 617 unsigned nr_args, u64 __user *tags) 618 { 619 __s32 __user *fds = (__s32 __user *) arg; 620 struct file *file; 621 int fd, ret; 622 unsigned i; 623 624 if (ctx->file_table.data.nr) 625 return -EBUSY; 626 if (!nr_args) 627 return -EINVAL; 628 if (nr_args > IORING_MAX_FIXED_FILES) 629 return -EMFILE; 630 if (nr_args > rlimit(RLIMIT_NOFILE)) 631 return -EMFILE; 632 if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args)) 633 return -ENOMEM; 634 635 for (i = 0; i < nr_args; i++) { 636 struct io_rsrc_node *node; 637 u64 tag = 0; 638 639 ret = -EFAULT; 640 if (tags && copy_from_user(&tag, &tags[i], sizeof(tag))) 641 goto fail; 642 if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) 643 goto fail; 644 /* allow sparse sets */ 645 if (!fds || fd == -1) { 646 ret = -EINVAL; 647 if (tag) 648 goto fail; 649 continue; 650 } 651 652 file = fget(fd); 653 ret = -EBADF; 654 if (unlikely(!file)) 655 goto fail; 656 657 /* 658 * Don't allow io_uring instances to be registered. 659 */ 660 if (io_is_uring_fops(file)) { 661 fput(file); 662 goto fail; 663 } 664 ret = -ENOMEM; 665 node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); 666 if (!node) { 667 fput(file); 668 goto fail; 669 } 670 if (tag) 671 node->tag = tag; 672 ctx->file_table.data.nodes[i] = node; 673 io_fixed_file_set(node, file); 674 io_file_bitmap_set(&ctx->file_table, i); 675 } 676 677 /* default it to the whole table */ 678 io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr); 679 return 0; 680 fail: 681 io_clear_table_tags(&ctx->file_table.data); 682 io_sqe_files_unregister(ctx); 683 return ret; 684 } 685 686 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) 687 { 688 if (!ctx->buf_table.nr) 689 return -ENXIO; 690 io_rsrc_data_free(ctx, &ctx->buf_table); 691 return 0; 692 } 693 694 /* 695 * Undo hpage_acct_ref() calls made during io_buffer_account_pin() on failure. 696 * This operates on the pages array since imu->bvec isn't populated yet. 697 */ 698 static void io_buffer_unaccount_hpages(struct io_ring_ctx *ctx, 699 struct page **pages, int nr_pages) 700 { 701 struct page *seen = NULL; 702 int i; 703 704 if (!ctx->user) 705 return; 706 707 for (i = 0; i < nr_pages; i++) { 708 struct page *hpage; 709 710 if (!PageCompound(pages[i])) 711 continue; 712 713 hpage = compound_head(pages[i]); 714 if (hpage == seen) 715 continue; 716 seen = hpage; 717 718 hpage_acct_unref(ctx, hpage); 719 cond_resched(); 720 } 721 } 722 723 static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, 724 int nr_pages) 725 { 726 unsigned long acct_pages = 0; 727 struct page *seen = NULL; 728 int i, ret; 729 730 if (!ctx->user) 731 return 0; 732 733 for (i = 0; i < nr_pages; i++) { 734 struct page *hpage; 735 bool acct_new; 736 737 if (!PageCompound(pages[i])) { 738 acct_pages++; 739 continue; 740 } 741 742 hpage = compound_head(pages[i]); 743 if (hpage == seen) 744 continue; 745 seen = hpage; 746 747 ret = hpage_acct_ref(ctx, hpage, &acct_new); 748 if (ret) { 749 io_buffer_unaccount_hpages(ctx, pages, i); 750 return ret; 751 } 752 if (acct_new) 753 acct_pages += page_size(hpage) >> PAGE_SHIFT; 754 cond_resched(); 755 } 756 757 /* Try to account the memory */ 758 if (acct_pages) { 759 ret = io_account_mem(ctx->user, ctx->mm_account, acct_pages); 760 if (ret) { 761 /* Undo the refs we just added */ 762 io_buffer_unaccount_hpages(ctx, pages, nr_pages); 763 return ret; 764 } 765 } 766 767 return 0; 768 } 769 770 static bool io_coalesce_buffer(struct page ***pages, int *nr_pages, 771 struct io_imu_folio_data *data) 772 { 773 struct page **page_array = *pages, **new_array = NULL; 774 unsigned nr_pages_left = *nr_pages; 775 unsigned nr_folios = data->nr_folios; 776 unsigned i, j; 777 778 /* Store head pages only*/ 779 new_array = kvmalloc_objs(struct page *, nr_folios); 780 if (!new_array) 781 return false; 782 783 for (i = 0, j = 0; i < nr_folios; i++) { 784 struct page *p = compound_head(page_array[j]); 785 struct folio *folio = page_folio(p); 786 unsigned int nr; 787 788 WARN_ON_ONCE(i > 0 && p != page_array[j]); 789 790 nr = i ? data->nr_pages_mid : data->nr_pages_head; 791 nr = min(nr, nr_pages_left); 792 /* Drop all but one ref, the entire folio will remain pinned. */ 793 if (nr > 1) 794 unpin_user_folio(folio, nr - 1); 795 j += nr; 796 nr_pages_left -= nr; 797 new_array[i] = p; 798 } 799 800 WARN_ON_ONCE(j != *nr_pages); 801 802 kvfree(page_array); 803 *pages = new_array; 804 *nr_pages = nr_folios; 805 return true; 806 } 807 808 bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, 809 struct io_imu_folio_data *data) 810 { 811 struct folio *folio = page_folio(page_array[0]); 812 unsigned int count = 1, nr_folios = 1; 813 int i; 814 815 data->nr_pages_mid = folio_nr_pages(folio); 816 data->folio_shift = folio_shift(folio); 817 data->first_folio_page_idx = folio_page_idx(folio, page_array[0]); 818 819 /* 820 * Check if pages are contiguous inside a folio, and all folios have 821 * the same page count except for the head and tail. 822 */ 823 for (i = 1; i < nr_pages; i++) { 824 if (page_folio(page_array[i]) == folio && 825 page_array[i] == page_array[i-1] + 1) { 826 count++; 827 continue; 828 } 829 830 if (nr_folios == 1) { 831 if (folio_page_idx(folio, page_array[i-1]) != 832 data->nr_pages_mid - 1) 833 return false; 834 835 data->nr_pages_head = count; 836 } else if (count != data->nr_pages_mid) { 837 return false; 838 } 839 840 folio = page_folio(page_array[i]); 841 if (folio_size(folio) != (1UL << data->folio_shift) || 842 folio_page_idx(folio, page_array[i]) != 0) 843 return false; 844 845 count = 1; 846 nr_folios++; 847 } 848 if (nr_folios == 1) 849 data->nr_pages_head = count; 850 851 data->nr_folios = nr_folios; 852 return true; 853 } 854 855 static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, 856 struct iovec *iov) 857 { 858 struct io_mapped_ubuf *imu = NULL; 859 struct page **pages = NULL; 860 struct io_rsrc_node *node; 861 unsigned long off; 862 size_t size; 863 int ret, nr_pages, i; 864 struct io_imu_folio_data data; 865 bool coalesced = false; 866 867 if (!iov->iov_base) { 868 if (iov->iov_len) 869 return ERR_PTR(-EFAULT); 870 /* remove the buffer without installing a new one */ 871 return NULL; 872 } 873 874 ret = io_validate_user_buf_range((unsigned long)iov->iov_base, 875 iov->iov_len); 876 if (ret) 877 return ERR_PTR(ret); 878 879 node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); 880 if (!node) 881 return ERR_PTR(-ENOMEM); 882 883 ret = -ENOMEM; 884 pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, 885 &nr_pages); 886 if (IS_ERR(pages)) { 887 ret = PTR_ERR(pages); 888 pages = NULL; 889 goto done; 890 } 891 892 /* If it's huge page(s), try to coalesce them into fewer bvec entries */ 893 if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) { 894 if (data.nr_pages_mid != 1) 895 coalesced = io_coalesce_buffer(&pages, &nr_pages, &data); 896 } 897 898 imu = io_alloc_imu(ctx, nr_pages); 899 if (!imu) 900 goto done; 901 902 imu->nr_bvecs = nr_pages; 903 ret = io_buffer_account_pin(ctx, pages, nr_pages); 904 if (ret) 905 goto done; 906 907 size = iov->iov_len; 908 /* store original address for later verification */ 909 imu->ubuf = (unsigned long) iov->iov_base; 910 imu->len = iov->iov_len; 911 imu->folio_shift = PAGE_SHIFT; 912 imu->release = io_release_ubuf; 913 imu->priv = imu; 914 imu->flags = 0; 915 imu->dir = IO_IMU_DEST | IO_IMU_SOURCE; 916 if (coalesced) 917 imu->folio_shift = data.folio_shift; 918 refcount_set(&imu->refs, 1); 919 920 off = (unsigned long)iov->iov_base & ~PAGE_MASK; 921 if (coalesced) 922 off += data.first_folio_page_idx << PAGE_SHIFT; 923 924 node->buf = imu; 925 ret = 0; 926 927 for (i = 0; i < nr_pages; i++) { 928 size_t vec_len; 929 930 vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off); 931 bvec_set_page(&imu->bvec[i], pages[i], vec_len, off); 932 off = 0; 933 size -= vec_len; 934 } 935 done: 936 if (ret) { 937 if (imu) 938 io_free_imu(ctx, imu); 939 if (pages) { 940 for (i = 0; i < nr_pages; i++) 941 unpin_user_folio(page_folio(pages[i]), 1); 942 } 943 io_cache_free(&ctx->node_cache, node); 944 node = ERR_PTR(ret); 945 } 946 kvfree(pages); 947 return node; 948 } 949 950 int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, 951 unsigned int nr_args, u64 __user *tags) 952 { 953 struct io_rsrc_data data; 954 struct iovec fast_iov, *iov = &fast_iov; 955 const struct iovec __user *uvec; 956 int i, ret; 957 958 BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); 959 960 if (ctx->buf_table.nr) 961 return -EBUSY; 962 if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) 963 return -EINVAL; 964 ret = io_rsrc_data_alloc(&data, nr_args); 965 if (ret) 966 return ret; 967 968 if (!arg) 969 memset(iov, 0, sizeof(*iov)); 970 971 for (i = 0; i < nr_args; i++) { 972 struct io_rsrc_node *node; 973 u64 tag = 0; 974 975 if (arg) { 976 uvec = (struct iovec __user *) arg; 977 iov = iovec_from_user(uvec, 1, 1, &fast_iov, io_is_compat(ctx)); 978 if (IS_ERR(iov)) { 979 ret = PTR_ERR(iov); 980 break; 981 } 982 if (io_is_compat(ctx)) 983 arg += sizeof(struct compat_iovec); 984 else 985 arg += sizeof(struct iovec); 986 } 987 988 if (tags) { 989 if (copy_from_user(&tag, &tags[i], sizeof(tag))) { 990 ret = -EFAULT; 991 break; 992 } 993 } 994 995 node = io_sqe_buffer_register(ctx, iov); 996 if (IS_ERR(node)) { 997 ret = PTR_ERR(node); 998 break; 999 } 1000 if (tag) { 1001 if (!node) { 1002 ret = -EINVAL; 1003 break; 1004 } 1005 node->tag = tag; 1006 } 1007 data.nodes[i] = node; 1008 } 1009 1010 ctx->buf_table = data; 1011 if (ret) { 1012 io_clear_table_tags(&ctx->buf_table); 1013 io_sqe_buffers_unregister(ctx); 1014 } 1015 return ret; 1016 } 1017 1018 int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, 1019 void (*release)(void *), unsigned int index, 1020 unsigned int issue_flags) 1021 { 1022 struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; 1023 struct io_rsrc_data *data = &ctx->buf_table; 1024 struct req_iterator rq_iter; 1025 struct io_mapped_ubuf *imu; 1026 struct io_rsrc_node *node; 1027 struct bio_vec bv; 1028 unsigned int nr_bvecs = 0; 1029 int ret = 0; 1030 1031 io_ring_submit_lock(ctx, issue_flags); 1032 if (index >= data->nr) { 1033 ret = -EINVAL; 1034 goto unlock; 1035 } 1036 index = array_index_nospec(index, data->nr); 1037 1038 if (data->nodes[index]) { 1039 ret = -EBUSY; 1040 goto unlock; 1041 } 1042 1043 node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); 1044 if (!node) { 1045 ret = -ENOMEM; 1046 goto unlock; 1047 } 1048 1049 /* 1050 * blk_rq_nr_phys_segments() may overestimate the number of bvecs 1051 * but avoids needing to iterate over the bvecs 1052 */ 1053 imu = io_alloc_imu(ctx, blk_rq_nr_phys_segments(rq)); 1054 if (!imu) { 1055 io_cache_free(&ctx->node_cache, node); 1056 ret = -ENOMEM; 1057 goto unlock; 1058 } 1059 1060 imu->ubuf = 0; 1061 imu->len = blk_rq_bytes(rq); 1062 imu->folio_shift = PAGE_SHIFT; 1063 refcount_set(&imu->refs, 1); 1064 imu->release = release; 1065 imu->priv = rq; 1066 imu->flags = IO_REGBUF_F_KBUF; 1067 imu->dir = 1 << rq_data_dir(rq); 1068 1069 rq_for_each_bvec(bv, rq, rq_iter) 1070 imu->bvec[nr_bvecs++] = bv; 1071 imu->nr_bvecs = nr_bvecs; 1072 1073 node->buf = imu; 1074 data->nodes[index] = node; 1075 unlock: 1076 io_ring_submit_unlock(ctx, issue_flags); 1077 return ret; 1078 } 1079 EXPORT_SYMBOL_GPL(io_buffer_register_bvec); 1080 1081 int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, 1082 unsigned int issue_flags) 1083 { 1084 struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; 1085 struct io_rsrc_data *data = &ctx->buf_table; 1086 struct io_rsrc_node *node; 1087 int ret = 0; 1088 1089 io_ring_submit_lock(ctx, issue_flags); 1090 if (index >= data->nr) { 1091 ret = -EINVAL; 1092 goto unlock; 1093 } 1094 index = array_index_nospec(index, data->nr); 1095 1096 node = data->nodes[index]; 1097 if (!node) { 1098 ret = -EINVAL; 1099 goto unlock; 1100 } 1101 if (!(node->buf->flags & IO_REGBUF_F_KBUF)) { 1102 ret = -EBUSY; 1103 goto unlock; 1104 } 1105 1106 io_put_rsrc_node(ctx, node); 1107 data->nodes[index] = NULL; 1108 unlock: 1109 io_ring_submit_unlock(ctx, issue_flags); 1110 return ret; 1111 } 1112 EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec); 1113 1114 static int validate_fixed_range(u64 buf_addr, size_t len, 1115 const struct io_mapped_ubuf *imu) 1116 { 1117 u64 buf_end; 1118 1119 if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) 1120 return -EFAULT; 1121 /* not inside the mapped region */ 1122 if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len))) 1123 return -EFAULT; 1124 if (unlikely(len > MAX_RW_COUNT)) 1125 return -EFAULT; 1126 return 0; 1127 } 1128 1129 static int io_import_kbuf(int ddir, struct iov_iter *iter, 1130 struct io_mapped_ubuf *imu, size_t len, size_t offset) 1131 { 1132 size_t count = len + offset; 1133 1134 iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count); 1135 iov_iter_advance(iter, offset); 1136 return 0; 1137 } 1138 1139 static int io_import_fixed(int ddir, struct iov_iter *iter, 1140 struct io_mapped_ubuf *imu, 1141 u64 buf_addr, size_t len) 1142 { 1143 const struct bio_vec *bvec; 1144 size_t folio_mask; 1145 unsigned nr_segs; 1146 size_t offset; 1147 int ret; 1148 1149 ret = validate_fixed_range(buf_addr, len, imu); 1150 if (unlikely(ret)) 1151 return ret; 1152 if (!(imu->dir & (1 << ddir))) 1153 return -EFAULT; 1154 if (unlikely(!len)) { 1155 iov_iter_bvec(iter, ddir, NULL, 0, 0); 1156 return 0; 1157 } 1158 1159 offset = buf_addr - imu->ubuf; 1160 1161 if (imu->flags & IO_REGBUF_F_KBUF) 1162 return io_import_kbuf(ddir, iter, imu, len, offset); 1163 1164 /* 1165 * Don't use iov_iter_advance() here, as it's really slow for 1166 * using the latter parts of a big fixed buffer - it iterates 1167 * over each segment manually. We can cheat a bit here for user 1168 * registered nodes, because we know that: 1169 * 1170 * 1) it's a BVEC iter, we set it up 1171 * 2) all bvecs are the same in size, except potentially the 1172 * first and last bvec 1173 */ 1174 folio_mask = (1UL << imu->folio_shift) - 1; 1175 bvec = imu->bvec; 1176 if (offset >= bvec->bv_len) { 1177 unsigned long seg_skip; 1178 1179 /* skip first vec */ 1180 offset -= bvec->bv_len; 1181 seg_skip = 1 + (offset >> imu->folio_shift); 1182 bvec += seg_skip; 1183 offset &= folio_mask; 1184 } 1185 nr_segs = (offset + len + bvec->bv_offset + folio_mask) >> imu->folio_shift; 1186 iov_iter_bvec(iter, ddir, bvec, nr_segs, len); 1187 iter->iov_offset = offset; 1188 return 0; 1189 } 1190 1191 inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req, 1192 unsigned issue_flags) 1193 { 1194 struct io_ring_ctx *ctx = req->ctx; 1195 struct io_rsrc_node *node; 1196 1197 if (req->flags & REQ_F_BUF_NODE) 1198 return req->buf_node; 1199 req->flags |= REQ_F_BUF_NODE; 1200 1201 io_ring_submit_lock(ctx, issue_flags); 1202 node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index); 1203 if (node) { 1204 node->refs++; 1205 req->buf_node = node; 1206 io_ring_submit_unlock(ctx, issue_flags); 1207 return node; 1208 } 1209 req->flags &= ~REQ_F_BUF_NODE; 1210 io_ring_submit_unlock(ctx, issue_flags); 1211 return NULL; 1212 } 1213 1214 int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, 1215 u64 buf_addr, size_t len, int ddir, 1216 unsigned issue_flags) 1217 { 1218 struct io_rsrc_node *node; 1219 1220 node = io_find_buf_node(req, issue_flags); 1221 if (!node) 1222 return -EFAULT; 1223 return io_import_fixed(ddir, iter, node->buf, buf_addr, len); 1224 } 1225 1226 static int io_buffer_acct_cloned_hpages(struct io_ring_ctx *ctx, 1227 struct io_mapped_ubuf *imu) 1228 { 1229 struct page *seen = NULL; 1230 int i, ret = 0; 1231 1232 if (imu->flags & IO_REGBUF_F_KBUF || !ctx->user) 1233 return 0; 1234 1235 for (i = 0; i < imu->nr_bvecs; i++) { 1236 struct page *page = imu->bvec[i].bv_page; 1237 struct page *hpage; 1238 bool acct_new; 1239 1240 if (!PageCompound(page)) 1241 continue; 1242 1243 hpage = compound_head(page); 1244 if (hpage == seen) 1245 continue; 1246 seen = hpage; 1247 1248 /* Atomically add reference for cloned buffer */ 1249 ret = hpage_acct_ref(ctx, hpage, &acct_new); 1250 if (ret) 1251 break; 1252 1253 cond_resched(); 1254 } 1255 1256 if (!ret) 1257 return 0; 1258 1259 /* Undo refs we added for bvecs [0..i) */ 1260 seen = NULL; 1261 for (int j = 0; j < i; j++) { 1262 struct page *p = imu->bvec[j].bv_page; 1263 struct page *hp; 1264 1265 if (!PageCompound(p)) 1266 continue; 1267 hp = compound_head(p); 1268 if (hp == seen) 1269 continue; 1270 seen = hp; 1271 hpage_acct_unref(ctx, hp); 1272 } 1273 return ret; 1274 } 1275 1276 /* Lock two rings at once. The rings must be different! */ 1277 static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2) 1278 { 1279 if (ctx1 > ctx2) 1280 swap(ctx1, ctx2); 1281 mutex_lock(&ctx1->uring_lock); 1282 mutex_lock_nested(&ctx2->uring_lock, SINGLE_DEPTH_NESTING); 1283 } 1284 1285 /* Both rings are locked by the caller. */ 1286 static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx, 1287 struct io_uring_clone_buffers *arg) 1288 { 1289 struct io_rsrc_data data; 1290 int i, ret, off, nr; 1291 unsigned int nbufs; 1292 1293 lockdep_assert_held(&ctx->uring_lock); 1294 lockdep_assert_held(&src_ctx->uring_lock); 1295 1296 /* 1297 * Accounting state is shared between the two rings; that only works if 1298 * both rings are accounted towards the same counters. 1299 */ 1300 if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account) 1301 return -EINVAL; 1302 1303 /* if offsets are given, must have nr specified too */ 1304 if (!arg->nr && (arg->dst_off || arg->src_off)) 1305 return -EINVAL; 1306 /* not allowed unless REPLACE is set */ 1307 if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE)) 1308 return -EBUSY; 1309 1310 nbufs = src_ctx->buf_table.nr; 1311 if (!nbufs) 1312 return -ENXIO; 1313 if (!arg->nr) 1314 arg->nr = nbufs; 1315 else if (arg->nr > nbufs) 1316 return -EINVAL; 1317 else if (arg->nr > IORING_MAX_REG_BUFFERS) 1318 return -EINVAL; 1319 if (check_add_overflow(arg->nr, arg->src_off, &off) || off > nbufs) 1320 return -EOVERFLOW; 1321 if (check_add_overflow(arg->nr, arg->dst_off, &nbufs)) 1322 return -EOVERFLOW; 1323 if (nbufs > IORING_MAX_REG_BUFFERS) 1324 return -EINVAL; 1325 1326 ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr)); 1327 if (ret) 1328 return ret; 1329 1330 /* Copy original dst nodes from before the cloned range */ 1331 for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) { 1332 struct io_rsrc_node *node = ctx->buf_table.nodes[i]; 1333 1334 if (node) { 1335 data.nodes[i] = node; 1336 node->refs++; 1337 } 1338 } 1339 1340 off = arg->dst_off; 1341 i = arg->src_off; 1342 nr = arg->nr; 1343 while (nr--) { 1344 struct io_rsrc_node *dst_node, *src_node; 1345 1346 src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i); 1347 if (!src_node) { 1348 dst_node = NULL; 1349 } else { 1350 dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); 1351 if (!dst_node) { 1352 io_rsrc_data_free(ctx, &data); 1353 return -ENOMEM; 1354 } 1355 1356 refcount_inc(&src_node->buf->refs); 1357 dst_node->buf = src_node->buf; 1358 /* track compound references to clones */ 1359 ret = io_buffer_acct_cloned_hpages(ctx, src_node->buf); 1360 if (ret) { 1361 refcount_dec(&src_node->buf->refs); 1362 io_cache_free(&ctx->node_cache, dst_node); 1363 io_rsrc_data_free(ctx, &data); 1364 return ret; 1365 } 1366 } 1367 data.nodes[off++] = dst_node; 1368 i++; 1369 } 1370 1371 /* Copy original dst nodes from after the cloned range */ 1372 for (i = nbufs; i < ctx->buf_table.nr; i++) { 1373 struct io_rsrc_node *node = ctx->buf_table.nodes[i]; 1374 1375 if (node) { 1376 data.nodes[i] = node; 1377 node->refs++; 1378 } 1379 } 1380 1381 /* 1382 * If asked for replace, put the old table. data->nodes[] holds both 1383 * old and new nodes at this point. 1384 */ 1385 if (arg->flags & IORING_REGISTER_DST_REPLACE) 1386 io_rsrc_data_free(ctx, &ctx->buf_table); 1387 1388 /* 1389 * ctx->buf_table must be empty now - either the contents are being 1390 * replaced and we just freed the table, or the contents are being 1391 * copied to a ring that does not have buffers yet (checked at function 1392 * entry). 1393 */ 1394 WARN_ON_ONCE(ctx->buf_table.nr); 1395 ctx->buf_table = data; 1396 return 0; 1397 } 1398 1399 /* 1400 * Copy the registered buffers from the source ring whose file descriptor 1401 * is given in the src_fd to the current ring. This is identical to registering 1402 * the buffers with ctx, except faster as mappings already exist. 1403 * 1404 * Since the memory is already accounted once, don't account it again. 1405 */ 1406 int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg) 1407 { 1408 struct io_uring_clone_buffers buf; 1409 struct io_ring_ctx *src_ctx; 1410 bool registered_src; 1411 struct file *file; 1412 int ret; 1413 1414 if (copy_from_user(&buf, arg, sizeof(buf))) 1415 return -EFAULT; 1416 if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE)) 1417 return -EINVAL; 1418 if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr) 1419 return -EBUSY; 1420 if (memchr_inv(buf.pad, 0, sizeof(buf.pad))) 1421 return -EINVAL; 1422 1423 registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0; 1424 file = io_uring_ctx_get_file(buf.src_fd, registered_src); 1425 if (IS_ERR(file)) 1426 return PTR_ERR(file); 1427 1428 src_ctx = file->private_data; 1429 if (src_ctx != ctx) { 1430 mutex_unlock(&ctx->uring_lock); 1431 lock_two_rings(ctx, src_ctx); 1432 1433 if (src_ctx->submitter_task && 1434 src_ctx->submitter_task != current) { 1435 ret = -EEXIST; 1436 goto out; 1437 } 1438 } 1439 1440 ret = io_clone_buffers(ctx, src_ctx, &buf); 1441 1442 out: 1443 if (src_ctx != ctx) 1444 mutex_unlock(&src_ctx->uring_lock); 1445 1446 if (!registered_src) 1447 fput(file); 1448 return ret; 1449 } 1450 1451 void io_vec_free(struct iou_vec *iv) 1452 { 1453 if (!iv->iovec) 1454 return; 1455 kfree(iv->iovec); 1456 iv->iovec = NULL; 1457 iv->nr = 0; 1458 } 1459 1460 int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries) 1461 { 1462 gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_NOWARN; 1463 struct iovec *iov; 1464 1465 iov = kmalloc_objs(iov[0], nr_entries, gfp); 1466 if (!iov) 1467 return -ENOMEM; 1468 1469 io_vec_free(iv); 1470 iv->iovec = iov; 1471 iv->nr = nr_entries; 1472 return 0; 1473 } 1474 1475 static int io_vec_fill_bvec(int ddir, struct iov_iter *iter, 1476 struct io_mapped_ubuf *imu, 1477 struct iovec *iovec, unsigned nr_iovs, 1478 struct iou_vec *vec) 1479 { 1480 unsigned long folio_size = 1 << imu->folio_shift; 1481 unsigned long folio_mask = folio_size - 1; 1482 struct bio_vec *res_bvec = vec->bvec; 1483 size_t total_len = 0; 1484 unsigned bvec_idx = 0; 1485 unsigned iov_idx; 1486 1487 for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) { 1488 size_t iov_len = iovec[iov_idx].iov_len; 1489 u64 buf_addr = (u64)(uintptr_t)iovec[iov_idx].iov_base; 1490 struct bio_vec *src_bvec; 1491 size_t offset; 1492 int ret; 1493 1494 ret = validate_fixed_range(buf_addr, iov_len, imu); 1495 if (unlikely(ret)) 1496 return ret; 1497 1498 if (unlikely(!iov_len)) 1499 return -EFAULT; 1500 if (unlikely(check_add_overflow(total_len, iov_len, &total_len))) 1501 return -EOVERFLOW; 1502 1503 offset = buf_addr - imu->ubuf; 1504 /* 1505 * Only the first bvec can have non zero bv_offset, account it 1506 * here and work with full folios below. 1507 */ 1508 offset += imu->bvec[0].bv_offset; 1509 1510 src_bvec = imu->bvec + (offset >> imu->folio_shift); 1511 offset &= folio_mask; 1512 1513 for (; iov_len; offset = 0, bvec_idx++, src_bvec++) { 1514 size_t seg_size = min_t(size_t, iov_len, 1515 folio_size - offset); 1516 1517 bvec_set_page(&res_bvec[bvec_idx], 1518 src_bvec->bv_page, seg_size, offset); 1519 iov_len -= seg_size; 1520 } 1521 } 1522 if (total_len > MAX_RW_COUNT) 1523 return -EINVAL; 1524 1525 iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len); 1526 return 0; 1527 } 1528 1529 static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs, 1530 struct io_mapped_ubuf *imu) 1531 { 1532 unsigned shift = imu->folio_shift; 1533 size_t max_segs = 0; 1534 unsigned i; 1535 1536 for (i = 0; i < nr_iovs; i++) { 1537 max_segs += (iov[i].iov_len >> shift) + 2; 1538 if (max_segs > INT_MAX) 1539 return -EOVERFLOW; 1540 } 1541 return max_segs; 1542 } 1543 1544 static int io_vec_fill_kern_bvec(int ddir, struct iov_iter *iter, 1545 struct io_mapped_ubuf *imu, 1546 struct iovec *iovec, unsigned nr_iovs, 1547 struct iou_vec *vec) 1548 { 1549 const struct bio_vec *src_bvec = imu->bvec; 1550 struct bio_vec *res_bvec = vec->bvec; 1551 unsigned res_idx = 0; 1552 size_t total_len = 0; 1553 unsigned iov_idx; 1554 1555 for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) { 1556 size_t offset = (size_t)(uintptr_t)iovec[iov_idx].iov_base; 1557 size_t iov_len = iovec[iov_idx].iov_len; 1558 struct bvec_iter bi = { 1559 .bi_size = offset + iov_len, 1560 }; 1561 struct bio_vec bv; 1562 1563 bvec_iter_advance(src_bvec, &bi, offset); 1564 for_each_mp_bvec(bv, src_bvec, bi, bi) 1565 res_bvec[res_idx++] = bv; 1566 total_len += iov_len; 1567 } 1568 iov_iter_bvec(iter, ddir, res_bvec, res_idx, total_len); 1569 return 0; 1570 } 1571 1572 static int iov_kern_bvec_size(const struct iovec *iov, 1573 const struct io_mapped_ubuf *imu, 1574 unsigned int *nr_seg) 1575 { 1576 size_t offset = (size_t)(uintptr_t)iov->iov_base; 1577 const struct bio_vec *bvec = imu->bvec; 1578 int start = 0, i = 0; 1579 size_t off = 0; 1580 int ret; 1581 1582 ret = validate_fixed_range(offset, iov->iov_len, imu); 1583 if (unlikely(ret)) 1584 return ret; 1585 1586 for (i = 0; off < offset + iov->iov_len && i < imu->nr_bvecs; 1587 off += bvec[i].bv_len, i++) { 1588 if (offset >= off && offset < off + bvec[i].bv_len) 1589 start = i; 1590 } 1591 *nr_seg = i - start; 1592 return 0; 1593 } 1594 1595 static int io_kern_bvec_size(struct iovec *iov, unsigned nr_iovs, 1596 struct io_mapped_ubuf *imu, unsigned *nr_segs) 1597 { 1598 unsigned max_segs = 0; 1599 size_t total_len = 0; 1600 unsigned i; 1601 int ret; 1602 1603 *nr_segs = 0; 1604 for (i = 0; i < nr_iovs; i++) { 1605 if (unlikely(!iov[i].iov_len)) 1606 return -EFAULT; 1607 if (unlikely(check_add_overflow(total_len, iov[i].iov_len, 1608 &total_len))) 1609 return -EOVERFLOW; 1610 ret = iov_kern_bvec_size(&iov[i], imu, &max_segs); 1611 if (unlikely(ret)) 1612 return ret; 1613 *nr_segs += max_segs; 1614 } 1615 if (total_len > MAX_RW_COUNT) 1616 return -EINVAL; 1617 return 0; 1618 } 1619 1620 int io_import_reg_vec(int ddir, struct iov_iter *iter, 1621 struct io_kiocb *req, struct iou_vec *vec, 1622 unsigned nr_iovs, unsigned issue_flags) 1623 { 1624 struct io_rsrc_node *node; 1625 struct io_mapped_ubuf *imu; 1626 unsigned iovec_off; 1627 struct iovec *iov; 1628 unsigned nr_segs; 1629 1630 node = io_find_buf_node(req, issue_flags); 1631 if (!node) 1632 return -EFAULT; 1633 imu = node->buf; 1634 if (!(imu->dir & (1 << ddir))) 1635 return -EFAULT; 1636 1637 iovec_off = vec->nr - nr_iovs; 1638 iov = vec->iovec + iovec_off; 1639 1640 if (imu->flags & IO_REGBUF_F_KBUF) { 1641 int ret = io_kern_bvec_size(iov, nr_iovs, imu, &nr_segs); 1642 1643 if (unlikely(ret)) 1644 return ret; 1645 } else { 1646 int ret = io_estimate_bvec_size(iov, nr_iovs, imu); 1647 1648 if (ret < 0) 1649 return ret; 1650 nr_segs = ret; 1651 } 1652 1653 if (sizeof(struct bio_vec) > sizeof(struct iovec)) { 1654 size_t bvec_bytes; 1655 1656 bvec_bytes = nr_segs * sizeof(struct bio_vec); 1657 nr_segs = (bvec_bytes + sizeof(*iov) - 1) / sizeof(*iov); 1658 nr_segs += nr_iovs; 1659 } 1660 1661 if (nr_segs > vec->nr) { 1662 struct iou_vec tmp_vec = {}; 1663 int ret; 1664 1665 ret = io_vec_realloc(&tmp_vec, nr_segs); 1666 if (ret) 1667 return ret; 1668 1669 iovec_off = tmp_vec.nr - nr_iovs; 1670 memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs); 1671 io_vec_free(vec); 1672 1673 *vec = tmp_vec; 1674 iov = vec->iovec + iovec_off; 1675 req->flags |= REQ_F_NEED_CLEANUP; 1676 } 1677 1678 if (imu->flags & IO_REGBUF_F_KBUF) 1679 return io_vec_fill_kern_bvec(ddir, iter, imu, iov, nr_iovs, vec); 1680 1681 return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec); 1682 } 1683 1684 int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv, 1685 const struct iovec __user *uvec, size_t uvec_segs) 1686 { 1687 struct iovec *iov; 1688 int iovec_off, ret; 1689 void *res; 1690 1691 if (uvec_segs > iv->nr) { 1692 ret = io_vec_realloc(iv, uvec_segs); 1693 if (ret) 1694 return ret; 1695 req->flags |= REQ_F_NEED_CLEANUP; 1696 } 1697 1698 /* pad iovec to the right */ 1699 iovec_off = iv->nr - uvec_segs; 1700 iov = iv->iovec + iovec_off; 1701 res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov, 1702 io_is_compat(req->ctx)); 1703 if (IS_ERR(res)) 1704 return PTR_ERR(res); 1705 1706 req->flags |= REQ_F_IMPORT_BUFFER; 1707 return 0; 1708 } 1709