// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "openclose.h"
#include "rsrc.h"
#include "memmap.h"
#include "register.h"

struct io_rsrc_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
};

static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
			struct iovec *iov, struct page **last_hpage);

/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)

int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
	return 0;
}

static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	if (ctx->user)
		__io_unaccount_mem(ctx->user, nr_pages);

	if (ctx->mm_account)
		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}

static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	int ret;

	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
	}

	if (ctx->mm_account)
		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);

	return 0;
}
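
/*
 * Example of the accounting arithmetic above, assuming 4 KiB pages and an
 * RLIMIT_MEMLOCK of 8 MiB (both purely illustrative values):
 *
 *	page_limit = 8 MiB >> PAGE_SHIFT = 2048 pages
 *
 *	// registering a 1 MiB buffer accounts SZ_1M >> PAGE_SHIFT = 256
 *	// pages against user->locked_vm; the cmpxchg loop fails with
 *	// -ENOMEM once locked_vm + 256 would exceed 2048.
 *
 * The atomic_long_try_cmpxchg() loop keeps the check-and-add atomic without
 * taking a lock, so concurrent registrations cannot overshoot the limit.
 */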

static int io_buffer_validate(struct iovec *iov)
{
	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);

	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
	if (!iov->iov_base)
		return iov->iov_len ? -EFAULT : 0;
	if (!iov->iov_len)
		return -EFAULT;

	/* arbitrary limit, but we need something */
	if (iov->iov_len > SZ_1G)
		return -EFAULT;

	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
		return -EOVERFLOW;

	return 0;
}

static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
	unsigned int i;

	if (node->buf) {
		struct io_mapped_ubuf *imu = node->buf;

		if (!refcount_dec_and_test(&imu->refs))
			return;
		for (i = 0; i < imu->nr_bvecs; i++)
			unpin_user_page(imu->bvec[i].bv_page);
		if (imu->acct_pages)
			io_unaccount_mem(ctx, imu->acct_pages);
		kvfree(imu);
	}
}

struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
{
	struct io_rsrc_node *node;

	node = kzalloc(sizeof(*node), GFP_KERNEL);
	if (node) {
		node->type = type;
		node->refs = 1;
	}
	return node;
}

__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data)
{
	if (!data->nr)
		return;
	while (data->nr--) {
		if (data->nodes[data->nr])
			io_put_rsrc_node(ctx, data->nodes[data->nr]);
	}
	kvfree(data->nodes);
	data->nodes = NULL;
	data->nr = 0;
}

__cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
{
	data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *),
				     GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (data->nodes) {
		data->nr = nr;
		return 0;
	}
	return -ENOMEM;
}

static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	int fd, i, err = 0;
	unsigned int done;

	if (!ctx->file_table.data.nr)
		return -ENXIO;
	if (up->offset + nr_args > ctx->file_table.data.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = up->offset + done;
		if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i))
			io_file_bitmap_clear(&ctx->file_table, i);

		if (fd != -1) {
			struct file *file = fget(fd);
			struct io_rsrc_node *node;

			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
			if (!node) {
				err = -ENOMEM;
				fput(file);
				break;
			}
			ctx->file_table.data.nodes[i] = node;
			if (tag)
				node->tag = tag;
			io_fixed_file_set(node, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}
	return done ? done : err;
}
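
/*
 * Example of the update semantics above (values are illustrative): with a
 * registered file table of 8 slots, an update at offset 2 with
 *
 *	__s32 fds[] = { 7, -1, IORING_REGISTER_FILES_SKIP, 9 };
 *
 * installs fd 7 into slot 2, clears slot 3, leaves slot 4 untouched and
 * installs fd 9 into slot 5. Tags may only accompany entries that install a
 * real file; pairing a tag with -1 or IORING_REGISTER_FILES_SKIP fails with
 * -EINVAL, as checked above.
 */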

static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec fast_iov, *iov;
	struct page *last_hpage = NULL;
	struct iovec __user *uvec;
	u64 user_data = up->data;
	__u32 done;
	int i, err;

	if (!ctx->buf_table.nr)
		return -ENXIO;
	if (up->offset + nr_args > ctx->buf_table.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		uvec = u64_to_user_ptr(user_data);
		iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
		if (IS_ERR(iov)) {
			err = PTR_ERR(iov);
			break;
		}
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(iov);
		if (err)
			break;
		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			err = PTR_ERR(node);
			break;
		}
		if (tag) {
			if (!node) {
				err = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
		io_reset_rsrc_node(ctx, &ctx->buf_table, i);
		ctx->buf_table.nodes[i] = node;
		if (ctx->compat)
			user_data += sizeof(struct compat_iovec);
		else
			user_data += sizeof(struct iovec);
	}
	return done ? done : err;
}

static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args)
{
	__u32 tmp;

	lockdep_assert_held(&ctx->uring_lock);

	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}
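
/*
 * Minimal userspace sketch of driving the update path above through the
 * io_uring_register(2) syscall. The struct layout comes from the uapi header;
 * the dispatch of IORING_REGISTER_BUFFERS_UPDATE to io_register_rsrc_update()
 * lives in register.c, so the exact call site is an assumption here:
 *
 *	struct io_uring_rsrc_update2 up = {
 *		.offset	= 4,				// first buffer slot to update
 *		.data	= (__u64)(uintptr_t)iovecs,	// array of struct iovec
 *		.tags	= (__u64)(uintptr_t)tags,	// optional, may be 0
 *		.nr	= 2,				// number of slots to update
 *	};
 *	ret = syscall(__NR_io_uring_register, ring_fd,
 *		      IORING_REGISTER_BUFFERS_UPDATE, &up, sizeof(up));
 *
 * Each iovec is validated and pinned via io_sqe_buffer_register(), and the
 * node previously occupying a slot is dropped through io_reset_rsrc_node().
 */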

__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}

int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	return 0;
}

static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_table.data.nr)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}

int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;
	int ret;

	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
						&up2, up->nr_args);
		io_ring_submit_unlock(ctx, issue_flags);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}
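
/*
 * The same update can also be issued asynchronously as an SQE, which is what
 * the prep/issue pair above implements. A minimal sketch of the SQE fields,
 * going by io_files_update_prep() (raw fields shown; liburing's prep helper
 * can be used instead, it is not assumed here):
 *
 *	sqe->opcode	= IORING_OP_FILES_UPDATE;
 *	sqe->addr	= (__u64)(uintptr_t)fds;	// __s32 array
 *	sqe->len	= nr_fds;
 *	sqe->off	= 10;			// first fixed-file slot to update
 *
 * Passing IORING_FILE_INDEX_ALLOC as the offset instead switches to the
 * allocation mode above: each fd is installed into a free slot and the chosen
 * slot index is written back over the corresponding fds[] entry, so the array
 * doubles as the result buffer.
 */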

void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
	lockdep_assert_held(&ctx->uring_lock);

	if (node->tag)
		io_post_aux_cqe(ctx, node->tag, 0, 0);

	switch (node->type) {
	case IORING_RSRC_FILE:
		if (io_slot_file(node))
			fput(io_slot_file(node));
		break;
	case IORING_RSRC_BUFFER:
		if (node->buf)
			io_buffer_unmap(ctx, node);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}

	kfree(node);
}

int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	if (!ctx->file_table.data.nr)
		return -ENXIO;

	io_free_file_tables(ctx, &ctx->file_table);
	io_file_table_set_alloc_range(ctx, 0, 0);
	return 0;
}

int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_table.data.nr)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args))
		return -ENOMEM;

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		ret = -EFAULT;
		if (tags && copy_from_user(&tag, &tags[i], sizeof(tag)))
			goto fail;
		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd)))
			goto fail;
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (tag)
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		ret = -ENOMEM;
		node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
		if (!node) {
			fput(file);
			goto fail;
		}
		if (tag)
			node->tag = tag;
		ctx->file_table.data.nodes[i] = node;
		io_fixed_file_set(node, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
	return 0;
fail:
	io_sqe_files_unregister(ctx);
	return ret;
}

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	if (!ctx->buf_table.nr)
		return -ENXIO;
	io_rsrc_data_free(ctx, &ctx->buf_table);
	return 0;
}

/*
 * Not super efficient, but this only runs at registration time. And we do
 * cache the last compound head, so generally we'll only do a full search
 * if we don't match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->buf_table.nr; i++) {
		struct io_rsrc_node *node = ctx->buf_table.nodes[i];
		struct io_mapped_ubuf *imu;

		if (!node)
			continue;
		imu = node->buf;
		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}

static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}
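
/*
 * Accounting example for the compound-page handling above, assuming 4 KiB
 * base pages and one 2 MiB huge page backing the buffer: the first pinned
 * subpage charges the full folio, page_size(hpage) >> PAGE_SHIFT = 512 pages,
 * and the remaining 511 subpages are skipped via the *last_hpage cache. If a
 * second buffer over the same huge page is registered later,
 * headpage_already_acct() finds the head page in the existing table and no
 * additional pages are charged.
 */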

static bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
			       struct io_imu_folio_data *data)
{
	struct page **page_array = *pages, **new_array = NULL;
	int nr_pages_left = *nr_pages, i, j;
	int nr_folios = data->nr_folios;

	/* Store head pages only */
	new_array = kvmalloc_array(nr_folios, sizeof(struct page *),
				   GFP_KERNEL);
	if (!new_array)
		return false;

	new_array[0] = compound_head(page_array[0]);
	/*
	 * The pages are bound to the folio, it doesn't
	 * actually unpin them but drops all but one reference,
	 * which is usually put down by io_buffer_unmap().
	 * Note, needs a better helper.
	 */
	if (data->nr_pages_head > 1)
		unpin_user_pages(&page_array[1], data->nr_pages_head - 1);

	j = data->nr_pages_head;
	nr_pages_left -= data->nr_pages_head;
	for (i = 1; i < nr_folios; i++) {
		unsigned int nr_unpin;

		new_array[i] = page_array[j];
		nr_unpin = min_t(unsigned int, nr_pages_left - 1,
				 data->nr_pages_mid - 1);
		if (nr_unpin)
			unpin_user_pages(&page_array[j+1], nr_unpin);
		j += data->nr_pages_mid;
		nr_pages_left -= data->nr_pages_mid;
	}
	kvfree(page_array);
	*pages = new_array;
	*nr_pages = nr_folios;
	return true;
}

bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
			      struct io_imu_folio_data *data)
{
	struct folio *folio = page_folio(page_array[0]);
	unsigned int count = 1, nr_folios = 1;
	int i;

	data->nr_pages_mid = folio_nr_pages(folio);
	data->folio_shift = folio_shift(folio);

	/*
	 * Check if pages are contiguous inside a folio, and all folios have
	 * the same page count except for the head and tail.
	 */
	for (i = 1; i < nr_pages; i++) {
		if (page_folio(page_array[i]) == folio &&
		    page_array[i] == page_array[i-1] + 1) {
			count++;
			continue;
		}

		if (nr_folios == 1) {
			if (folio_page_idx(folio, page_array[i-1]) !=
			    data->nr_pages_mid - 1)
				return false;

			data->nr_pages_head = count;
		} else if (count != data->nr_pages_mid) {
			return false;
		}

		folio = page_folio(page_array[i]);
		if (folio_size(folio) != (1UL << data->folio_shift) ||
		    folio_page_idx(folio, page_array[i]) != 0)
			return false;

		count = 1;
		nr_folios++;
	}
	if (nr_folios == 1)
		data->nr_pages_head = count;

	data->nr_folios = nr_folios;
	return true;
}
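
/*
 * Worked example for the two helpers above, assuming a buffer fully backed by
 * a single 2 MiB folio (4 KiB base pages): io_pin_pages() returns 512 page
 * pointers, io_check_coalesce_buffer() reports nr_folios = 1,
 * nr_pages_head = nr_pages_mid = 512 and folio_shift = 21, and
 * io_coalesce_buffer() shrinks the array to just the head page while dropping
 * the 511 extra pins. The buffer is then described by a single bvec spanning
 * the whole folio instead of 512 page-sized entries.
 */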

static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
						   struct iovec *iov,
						   struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	struct io_rsrc_node *node;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;
	struct io_imu_folio_data data;
	bool coalesced = false;

	if (!iov->iov_base)
		return NULL;

	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
	if (!node)
		return ERR_PTR(-ENOMEM);
	node->buf = NULL;

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
			     &nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	/* If it's huge page(s), try to coalesce them into fewer bvec entries */
	if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) {
		if (data.nr_pages_mid != 1)
			coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
	}

	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
	if (!imu)
		goto done;

	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret) {
		unpin_user_pages(pages, nr_pages);
		goto done;
	}

	size = iov->iov_len;
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->len = iov->iov_len;
	imu->nr_bvecs = nr_pages;
	imu->folio_shift = PAGE_SHIFT;
	if (coalesced)
		imu->folio_shift = data.folio_shift;
	refcount_set(&imu->refs, 1);
	off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1);
	node->buf = imu;
	ret = 0;

	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
		off = 0;
		size -= vec_len;
	}
done:
	if (ret) {
		kvfree(imu);
		if (node)
			io_put_rsrc_node(ctx, node);
		node = ERR_PTR(ret);
	}
	kvfree(pages);
	return node;
}

int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data data;
	struct iovec fast_iov, *iov = &fast_iov;
	const struct iovec __user *uvec;
	int i, ret;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->buf_table.nr)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_data_alloc(&data, nr_args);
	if (ret)
		return ret;

	if (!arg)
		memset(iov, 0, sizeof(*iov));

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		if (arg) {
			uvec = (struct iovec __user *) arg;
			iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
			if (IS_ERR(iov)) {
				ret = PTR_ERR(iov);
				break;
			}
			ret = io_buffer_validate(iov);
			if (ret)
				break;
			if (ctx->compat)
				arg += sizeof(struct compat_iovec);
			else
				arg += sizeof(struct iovec);
		}

		if (tags) {
			if (copy_from_user(&tag, &tags[i], sizeof(tag))) {
				ret = -EFAULT;
				break;
			}
		}

		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			ret = PTR_ERR(node);
			break;
		}
		if (tag) {
			if (!node) {
				ret = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		data.nodes[i] = node;
	}

	ctx->buf_table = data;
	if (ret)
		io_sqe_buffers_unregister(ctx);
	return ret;
}
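
/*
 * Registration as seen from userspace, for reference; a minimal sketch using
 * liburing's io_uring_register_buffers() (the liburing call is assumed here,
 * the raw equivalent is IORING_REGISTER_BUFFERS with an iovec array):
 *
 *	struct iovec iov = {
 *		.iov_base = buf,	// e.g. from malloc() or mmap()
 *		.iov_len  = buf_len,	// capped at SZ_1G by io_buffer_validate()
 *	};
 *	ret = io_uring_register_buffers(&ring, &iov, 1);
 *
 * Each iovec lands in io_sqe_buffer_register() above; a NULL iov_base with a
 * zero length is allowed and simply leaves that table slot sparse.
 */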

int io_import_fixed(int ddir, struct iov_iter *iter,
		    struct io_mapped_ubuf *imu,
		    u64 buf_addr, size_t len)
{
	u64 buf_end;
	size_t offset;

	if (WARN_ON_ONCE(!imu))
		return -EFAULT;
	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
		return -EFAULT;

	/*
	 * Might not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, len);

	if (offset) {
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here, because
		 * we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are the same in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or covers the whole
		 * first bvec), just use iov_iter_advance(). This makes it
		 * easier since we can just skip the first segment, which may
		 * not be folio_size aligned.
		 */
		const struct bio_vec *bvec = imu->bvec;

		if (offset < bvec->bv_len) {
			iter->iov_offset = offset;
		} else {
			unsigned long seg_skip;

			/* skip first vec */
			offset -= bvec->bv_len;
			seg_skip = 1 + (offset >> imu->folio_shift);

			iter->bvec += seg_skip;
			iter->nr_segs -= seg_skip;
			iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1);
		}
	}

	return 0;
}
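
/*
 * Worked example for the segment-skip math above, assuming an uncoalesced
 * buffer (imu->folio_shift == PAGE_SHIFT == 12, so every bvec except possibly
 * the last covers 4096 bytes) and a page-aligned imu->ubuf, for
 * buf_addr - imu->ubuf == 10000:
 *
 *	offset	   = 10000 - 4096 = 5904	// after skipping bvec[0]
 *	seg_skip   = 1 + (5904 >> 12) = 2
 *	iov_offset = 5904 & 4095 = 1808
 *
 * i.e. the iterator starts 1808 bytes into bvec[2], which is exactly byte
 * 2 * 4096 + 1808 = 10000 of the registered buffer.
 */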

static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
			    struct io_uring_clone_buffers *arg)
{
	struct io_rsrc_data data;
	int i, ret, off, nr;
	unsigned int nbufs;

	/*
	 * Accounting state is shared between the two rings; that only works if
	 * both rings are accounted towards the same counters.
	 */
	if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account)
		return -EINVAL;

	/* if offsets are given, must have nr specified too */
	if (!arg->nr && (arg->dst_off || arg->src_off))
		return -EINVAL;
	/* not allowed unless REPLACE is set */
	if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
		return -EBUSY;

	nbufs = READ_ONCE(src_ctx->buf_table.nr);
	if (!arg->nr)
		arg->nr = nbufs;
	else if (arg->nr > nbufs)
		return -EINVAL;
	else if (arg->nr > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
		return -EOVERFLOW;

	ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr));
	if (ret)
		return ret;

	/* Fill entries in data from dst that won't overlap with src */
	for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
		struct io_rsrc_node *src_node = ctx->buf_table.nodes[i];

		if (src_node) {
			data.nodes[i] = src_node;
			src_node->refs++;
		}
	}

	/*
	 * Drop our own lock here. We'll setup the data we need and reference
	 * the source buffers, then re-grab, check, and assign at the end.
	 */
	mutex_unlock(&ctx->uring_lock);

	mutex_lock(&src_ctx->uring_lock);
	ret = -ENXIO;
	nbufs = src_ctx->buf_table.nr;
	if (!nbufs)
		goto out_unlock;
	ret = -EINVAL;
	if (!arg->nr)
		arg->nr = nbufs;
	else if (arg->nr > nbufs)
		goto out_unlock;
	ret = -EOVERFLOW;
	if (check_add_overflow(arg->nr, arg->src_off, &off))
		goto out_unlock;
	if (off > nbufs)
		goto out_unlock;

	off = arg->dst_off;
	i = arg->src_off;
	nr = arg->nr;
	while (nr--) {
		struct io_rsrc_node *dst_node, *src_node;

		src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i);
		if (!src_node) {
			dst_node = NULL;
		} else {
			dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
			if (!dst_node) {
				ret = -ENOMEM;
				goto out_unlock;
			}

			refcount_inc(&src_node->buf->refs);
			dst_node->buf = src_node->buf;
		}
		data.nodes[off++] = dst_node;
		i++;
	}

	/* Have a ref on the bufs now, drop src lock and re-grab our own lock */
	mutex_unlock(&src_ctx->uring_lock);
	mutex_lock(&ctx->uring_lock);

	/*
	 * If asked for replace, put the old table. data->nodes[] holds both
	 * old and new nodes at this point.
	 */
	if (arg->flags & IORING_REGISTER_DST_REPLACE)
		io_rsrc_data_free(ctx, &ctx->buf_table);

	/*
	 * ctx->buf_table should be empty now - either the contents are being
	 * replaced and we just freed the table, or someone raced setting up
	 * a buffer table while the clone was happening. If not empty, fall
	 * through to failure handling.
	 */
	if (!ctx->buf_table.nr) {
		ctx->buf_table = data;
		return 0;
	}

	mutex_unlock(&ctx->uring_lock);
	mutex_lock(&src_ctx->uring_lock);
	/* someone raced setting up buffers, dump ours */
	ret = -EBUSY;
out_unlock:
	io_rsrc_data_free(ctx, &data);
	mutex_unlock(&src_ctx->uring_lock);
	mutex_lock(&ctx->uring_lock);
	return ret;
}

/*
 * Copy the registered buffers from the source ring whose file descriptor
 * is given in the src_fd to the current ring. This is identical to registering
 * the buffers with ctx, except faster as mappings already exist.
 *
 * Since the memory is already accounted once, don't account it again.
 */
int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_clone_buffers buf;
	bool registered_src;
	struct file *file;
	int ret;

	if (copy_from_user(&buf, arg, sizeof(buf)))
		return -EFAULT;
	if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE))
		return -EINVAL;
	if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr)
		return -EBUSY;
	if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
		return -EINVAL;

	registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
	file = io_uring_register_get_file(buf.src_fd, registered_src);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ret = io_clone_buffers(ctx, file->private_data, &buf);
	if (!registered_src)
		fput(file);
	return ret;
}
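
/*
 * Userspace view of the clone operation above, as a minimal sketch using the
 * raw register opcode (recent liburing also wraps this, but that wrapper is
 * not assumed here; the field names come from struct io_uring_clone_buffers
 * in the uapi header, and the nr_args of 1 follows the register.c dispatch):
 *
 *	struct io_uring_clone_buffers buf = {
 *		.src_fd	 = src_ring_fd,
 *		.flags	 = 0,		// or IORING_REGISTER_SRC_REGISTERED /
 *					// IORING_REGISTER_DST_REPLACE
 *		.src_off = 0,
 *		.dst_off = 0,
 *		.nr	 = 0,		// 0 means "all source buffers"
 *	};
 *	ret = syscall(__NR_io_uring_register, dst_ring_fd,
 *		      IORING_REGISTER_CLONE_BUFFERS, &buf, 1);
 *
 * The destination ring ends up with nodes that share the source's
 * io_mapped_ubuf objects (refcount_inc() in io_clone_buffers()), so the pages
 * are neither re-pinned nor re-accounted.
 */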