// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "openclose.h"
#include "rsrc.h"
#include "memmap.h"
#include "register.h"

struct io_rsrc_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
};

static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
			struct iovec *iov, struct page **last_hpage);

/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)

int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
	return 0;
}

static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	if (ctx->user)
		__io_unaccount_mem(ctx->user, nr_pages);

	if (ctx->mm_account)
		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}

static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	int ret;

	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
	}

	if (ctx->mm_account)
		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);

	return 0;
}

static int io_buffer_validate(struct iovec *iov)
{
	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);

	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
	if (!iov->iov_base)
		return iov->iov_len ? -EFAULT : 0;
	if (!iov->iov_len)
		return -EFAULT;

	/* arbitrary limit, but we need something */
	if (iov->iov_len > SZ_1G)
		return -EFAULT;

	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
		return -EOVERFLOW;

	return 0;
}
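
/*
 * Drop the node's reference on a mapped buffer. Only when the last
 * reference goes away are the pinned pages released and any accounted
 * memory returned via io_unaccount_mem().
 */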
static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
	unsigned int i;

	if (node->buf) {
		struct io_mapped_ubuf *imu = node->buf;

		if (!refcount_dec_and_test(&imu->refs))
			return;
		for (i = 0; i < imu->nr_bvecs; i++)
			unpin_user_page(imu->bvec[i].bv_page);
		if (imu->acct_pages)
			io_unaccount_mem(ctx, imu->acct_pages);
		kvfree(imu);
	}
}

struct io_rsrc_node *io_rsrc_node_alloc(int type)
{
	struct io_rsrc_node *node;

	node = kzalloc(sizeof(*node), GFP_KERNEL);
	if (node) {
		node->type = type;
		node->refs = 1;
	}
	return node;
}

__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data)
{
	if (!data->nr)
		return;
	while (data->nr--) {
		if (data->nodes[data->nr])
			io_put_rsrc_node(ctx, data->nodes[data->nr]);
	}
	kvfree(data->nodes);
	data->nodes = NULL;
	data->nr = 0;
}

__cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
{
	data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *),
				     GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (data->nodes) {
		data->nr = nr;
		return 0;
	}
	return -ENOMEM;
}

static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	int fd, i, err = 0;
	unsigned int done;

	if (!ctx->file_table.data.nr)
		return -ENXIO;
	if (up->offset + nr_args > ctx->file_table.data.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = up->offset + done;
		if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i))
			io_file_bitmap_clear(&ctx->file_table, i);

		if (fd != -1) {
			struct file *file = fget(fd);
			struct io_rsrc_node *node;

			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			node = io_rsrc_node_alloc(IORING_RSRC_FILE);
			if (!node) {
				err = -ENOMEM;
				fput(file);
				break;
			}
			ctx->file_table.data.nodes[i] = node;
			if (tag)
				node->tag = tag;
			io_fixed_file_set(node, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}
	return done ? done : err;
}
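
/*
 * Illustrative userspace sketch (not part of this file): updating two
 * previously registered file slots, starting at slot 4, through the
 * update path handled by __io_sqe_files_update() above. Only the fields
 * that the kernel side reads are set; the exact io_uring_register(2)
 * invocation is an assumption for illustration.
 *
 *	int fds[2] = { new_fd0, new_fd1 };
 *	struct io_uring_rsrc_update2 up = {
 *		.offset	= 4,
 *		.data	= (__u64)(uintptr_t)fds,
 *		.nr	= 2,
 *	};
 *	io_uring_register(ring_fd, IORING_REGISTER_FILES_UPDATE2, &up, sizeof(up));
 */
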
static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec fast_iov, *iov;
	struct page *last_hpage = NULL;
	struct iovec __user *uvec;
	u64 user_data = up->data;
	__u32 done;
	int i, err;

	if (!ctx->buf_table.nr)
		return -ENXIO;
	if (up->offset + nr_args > ctx->buf_table.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		uvec = u64_to_user_ptr(user_data);
		iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
		if (IS_ERR(iov)) {
			err = PTR_ERR(iov);
			break;
		}
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(iov);
		if (err)
			break;
		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			err = PTR_ERR(node);
			break;
		}
		if (tag) {
			if (!node) {
				err = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
		io_reset_rsrc_node(ctx, &ctx->buf_table, i);
		ctx->buf_table.nodes[i] = node;
		if (ctx->compat)
			user_data += sizeof(struct compat_iovec);
		else
			user_data += sizeof(struct iovec);
	}
	return done ? done : err;
}

static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args)
{
	__u32 tmp;

	lockdep_assert_held(&ctx->uring_lock);

	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}
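
/*
 * Illustrative userspace sketch (not part of this file): reserving a
 * sparse table of 64 empty file slots via io_register_rsrc() above, to
 * be filled in later through the update paths. Only the fields read by
 * the kernel side are set; the exact invocation is an assumption for
 * illustration.
 *
 *	struct io_uring_rsrc_register rr = {
 *		.nr	= 64,
 *		.flags	= IORING_RSRC_REGISTER_SPARSE,
 *	};
 *	io_uring_register(ring_fd, IORING_REGISTER_FILES2, &rr, sizeof(rr));
 */
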
int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	return 0;
}

static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_table.data.nr)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}

int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;
	int ret;

	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
						&up2, up->nr_args);
		io_ring_submit_unlock(ctx, issue_flags);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}
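
/*
 * Illustrative SQE setup (not part of this file) for the async update
 * path handled by io_files_update_prep()/io_files_update() above. Only
 * the SQE fields read by the prep handler are filled in; the raw form
 * shown is an assumption for illustration.
 *
 *	sqe->opcode	= IORING_OP_FILES_UPDATE;
 *	sqe->addr	= (__u64)(uintptr_t)fds;	// array of __s32 fds
 *	sqe->len	= nr_fds;
 *	sqe->off	= 4;	// first slot to update, or IORING_FILE_INDEX_ALLOC
 */
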
void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
	if (node->tag)
		io_post_aux_cqe(ctx, node->tag, 0, 0);

	switch (node->type) {
	case IORING_RSRC_FILE:
		if (io_slot_file(node))
			fput(io_slot_file(node));
		break;
	case IORING_RSRC_BUFFER:
		if (node->buf)
			io_buffer_unmap(ctx, node);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}

	kfree(node);
}

int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	if (!ctx->file_table.data.nr)
		return -ENXIO;

	io_free_file_tables(ctx, &ctx->file_table);
	io_file_table_set_alloc_range(ctx, 0, 0);
	return 0;
}

int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_table.data.nr)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args))
		return -ENOMEM;

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		ret = -EFAULT;
		if (tags && copy_from_user(&tag, &tags[i], sizeof(tag)))
			goto fail;
		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd)))
			goto fail;
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (tag)
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		ret = -ENOMEM;
		node = io_rsrc_node_alloc(IORING_RSRC_FILE);
		if (!node) {
			fput(file);
			goto fail;
		}
		if (tag)
			node->tag = tag;
		ctx->file_table.data.nodes[i] = node;
		io_fixed_file_set(node, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
	return 0;
fail:
	io_sqe_files_unregister(ctx);
	return ret;
}

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	if (!ctx->buf_table.nr)
		return -ENXIO;
	io_rsrc_data_free(ctx, &ctx->buf_table);
	return 0;
}

/*
 * Not super efficient, but this is just a registration time. And we do cache
 * the last compound head, so generally we'll only do a full search if we don't
 * match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->buf_table.nr; i++) {
		struct io_rsrc_node *node = ctx->buf_table.nodes[i];
		struct io_mapped_ubuf *imu;

		if (!node)
			continue;
		imu = node->buf;
		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}

static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}
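
/*
 * Accounting example (illustrative, not from the original source): if a
 * buffer is backed by a 2MB compound page on a 4K PAGE_SIZE system, the
 * whole huge page is charged once as 512 pages (page_size(hpage) >>
 * PAGE_SHIFT), even if only part of it is covered by this buffer. A
 * later buffer landing on the same compound head is not charged again,
 * courtesy of headpage_already_acct() above.
 */
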
static bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
			       struct io_imu_folio_data *data)
{
	struct page **page_array = *pages, **new_array = NULL;
	int nr_pages_left = *nr_pages, i, j;
	int nr_folios = data->nr_folios;

	/* Store head pages only */
	new_array = kvmalloc_array(nr_folios, sizeof(struct page *),
				   GFP_KERNEL);
	if (!new_array)
		return false;

	new_array[0] = compound_head(page_array[0]);
	/*
	 * The pages are bound to the folio, it doesn't
	 * actually unpin them but drops all but one reference,
	 * which is usually put down by io_buffer_unmap().
	 * Note, needs a better helper.
	 */
	if (data->nr_pages_head > 1)
		unpin_user_pages(&page_array[1], data->nr_pages_head - 1);

	j = data->nr_pages_head;
	nr_pages_left -= data->nr_pages_head;
	for (i = 1; i < nr_folios; i++) {
		unsigned int nr_unpin;

		new_array[i] = page_array[j];
		nr_unpin = min_t(unsigned int, nr_pages_left - 1,
				 data->nr_pages_mid - 1);
		if (nr_unpin)
			unpin_user_pages(&page_array[j+1], nr_unpin);
		j += data->nr_pages_mid;
		nr_pages_left -= data->nr_pages_mid;
	}
	kvfree(page_array);
	*pages = new_array;
	*nr_pages = nr_folios;
	return true;
}

bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
			      struct io_imu_folio_data *data)
{
	struct folio *folio = page_folio(page_array[0]);
	unsigned int count = 1, nr_folios = 1;
	int i;

	data->nr_pages_mid = folio_nr_pages(folio);
	data->folio_shift = folio_shift(folio);

	/*
	 * Check if pages are contiguous inside a folio, and all folios have
	 * the same page count except for the head and tail.
	 */
	for (i = 1; i < nr_pages; i++) {
		if (page_folio(page_array[i]) == folio &&
		    page_array[i] == page_array[i-1] + 1) {
			count++;
			continue;
		}

		if (nr_folios == 1) {
			if (folio_page_idx(folio, page_array[i-1]) !=
			    data->nr_pages_mid - 1)
				return false;

			data->nr_pages_head = count;
		} else if (count != data->nr_pages_mid) {
			return false;
		}

		folio = page_folio(page_array[i]);
		if (folio_size(folio) != (1UL << data->folio_shift) ||
		    folio_page_idx(folio, page_array[i]) != 0)
			return false;

		count = 1;
		nr_folios++;
	}
	if (nr_folios == 1)
		data->nr_pages_head = count;

	data->nr_folios = nr_folios;
	return true;
}
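
/*
 * Coalescing example (illustrative, not from the original source): a
 * folio-aligned 4MB buffer backed by two 2MB folios is pinned as 1024
 * PAGE_SIZE pages. io_check_coalesce_buffer() above reports nr_folios = 2
 * with folio_shift = 21, and io_coalesce_buffer() shrinks the page array
 * to the two folio heads, so the registration below needs only two bvec
 * entries instead of 1024.
 */
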
static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
						   struct iovec *iov,
						   struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	struct io_rsrc_node *node;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;
	struct io_imu_folio_data data;
	bool coalesced = false;

	if (!iov->iov_base)
		return NULL;

	node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
	if (!node)
		return ERR_PTR(-ENOMEM);
	node->buf = NULL;

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
			     &nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	/* If it's huge page(s), try to coalesce them into fewer bvec entries */
	if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) {
		if (data.nr_pages_mid != 1)
			coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
	}

	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
	if (!imu)
		goto done;

	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret) {
		unpin_user_pages(pages, nr_pages);
		goto done;
	}

	size = iov->iov_len;
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->len = iov->iov_len;
	imu->nr_bvecs = nr_pages;
	imu->folio_shift = PAGE_SHIFT;
	if (coalesced)
		imu->folio_shift = data.folio_shift;
	refcount_set(&imu->refs, 1);
	off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1);
	node->buf = imu;
	ret = 0;

	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
		off = 0;
		size -= vec_len;
	}
done:
	if (ret) {
		kvfree(imu);
		if (node)
			io_put_rsrc_node(ctx, node);
		node = ERR_PTR(ret);
	}
	kvfree(pages);
	return node;
}

int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data data;
	struct iovec fast_iov, *iov = &fast_iov;
	const struct iovec __user *uvec;
	int i, ret;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->buf_table.nr)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_data_alloc(&data, nr_args);
	if (ret)
		return ret;

	if (!arg)
		memset(iov, 0, sizeof(*iov));

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		if (arg) {
			uvec = (struct iovec __user *) arg;
			iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
			if (IS_ERR(iov)) {
				ret = PTR_ERR(iov);
				break;
			}
			ret = io_buffer_validate(iov);
			if (ret)
				break;
			if (ctx->compat)
				arg += sizeof(struct compat_iovec);
			else
				arg += sizeof(struct iovec);
		}

		if (tags) {
			if (copy_from_user(&tag, &tags[i], sizeof(tag))) {
				ret = -EFAULT;
				break;
			}
		}

		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			ret = PTR_ERR(node);
			break;
		}
		if (tag) {
			if (!node) {
				ret = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		data.nodes[i] = node;
	}

	ctx->buf_table = data;
	if (ret)
		io_sqe_buffers_unregister(ctx);
	return ret;
}
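
/*
 * Illustrative userspace sketch (not part of this file): registering one
 * fixed buffer through the classic iovec-array interface that ends up in
 * io_sqe_buffers_register() above. The invocation is an assumption for
 * illustration; BUF_LEN is a placeholder.
 *
 *	void *buf = malloc(BUF_LEN);
 *	struct iovec iov = { .iov_base = buf, .iov_len = BUF_LEN };
 *	io_uring_register(ring_fd, IORING_REGISTER_BUFFERS, &iov, 1);
 */
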
int io_import_fixed(int ddir, struct iov_iter *iter,
		    struct io_mapped_ubuf *imu,
		    u64 buf_addr, size_t len)
{
	u64 buf_end;
	size_t offset;

	if (WARN_ON_ONCE(!imu))
		return -EFAULT;
	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
		return -EFAULT;

	/*
	 * Might not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, len);

	if (offset) {
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here, because
		 * we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are the same in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or the whole first
		 * bvec), just use iov_iter_advance(). This makes it easier
		 * since we can just skip the first segment, which may not
		 * be folio_size aligned.
		 */
		const struct bio_vec *bvec = imu->bvec;

		if (offset < bvec->bv_len) {
			iter->iov_offset = offset;
		} else {
			unsigned long seg_skip;

			/* skip first vec */
			offset -= bvec->bv_len;
			seg_skip = 1 + (offset >> imu->folio_shift);

			iter->bvec += seg_skip;
			iter->nr_segs -= seg_skip;
			iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1);
		}
	}

	return 0;
}
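
/*
 * Worked example (illustrative, not from the original source): with
 * folio_shift = PAGE_SHIFT = 12, a first bvec of bv_len = 4096 and an
 * import offset of 10000, the slow per-segment walk is avoided:
 * offset becomes 10000 - 4096 = 5904, seg_skip = 1 + (5904 >> 12) = 2,
 * and iov_offset = 5904 & 4095 = 1808, i.e. the iterator starts 1808
 * bytes into the third bvec without advancing segment by segment.
 */
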
/* Lock two rings at once. The rings must be different! */
static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
{
	if (ctx1 > ctx2)
		swap(ctx1, ctx2);
	mutex_lock(&ctx1->uring_lock);
	mutex_lock_nested(&ctx2->uring_lock, SINGLE_DEPTH_NESTING);
}

/* Both rings are locked by the caller. */
static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
			    struct io_uring_clone_buffers *arg)
{
	struct io_rsrc_data data;
	int i, ret, off, nr;
	unsigned int nbufs;

	lockdep_assert_held(&ctx->uring_lock);
	lockdep_assert_held(&src_ctx->uring_lock);

	/*
	 * Accounting state is shared between the two rings; that only works if
	 * both rings are accounted towards the same counters.
	 */
	if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account)
		return -EINVAL;

	/* if offsets are given, must have nr specified too */
	if (!arg->nr && (arg->dst_off || arg->src_off))
		return -EINVAL;
	/* not allowed unless REPLACE is set */
	if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
		return -EBUSY;

	nbufs = src_ctx->buf_table.nr;
	if (!arg->nr)
		arg->nr = nbufs;
	else if (arg->nr > nbufs)
		return -EINVAL;
	else if (arg->nr > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
		return -EOVERFLOW;

	ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr));
	if (ret)
		return ret;

	/* Fill entries in data from dst that won't overlap with src */
	for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
		struct io_rsrc_node *src_node = ctx->buf_table.nodes[i];

		if (src_node) {
			data.nodes[i] = src_node;
			src_node->refs++;
		}
	}

	ret = -ENXIO;
	nbufs = src_ctx->buf_table.nr;
	if (!nbufs)
		goto out_free;
	ret = -EINVAL;
	if (!arg->nr)
		arg->nr = nbufs;
	else if (arg->nr > nbufs)
		goto out_free;
	ret = -EOVERFLOW;
	if (check_add_overflow(arg->nr, arg->src_off, &off))
		goto out_free;
	if (off > nbufs)
		goto out_free;

	off = arg->dst_off;
	i = arg->src_off;
	nr = arg->nr;
	while (nr--) {
		struct io_rsrc_node *dst_node, *src_node;

		src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i);
		if (!src_node) {
			dst_node = NULL;
		} else {
			dst_node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
			if (!dst_node) {
				ret = -ENOMEM;
				goto out_free;
			}

			refcount_inc(&src_node->buf->refs);
			dst_node->buf = src_node->buf;
		}
		data.nodes[off++] = dst_node;
		i++;
	}

	/*
	 * If asked for replace, put the old table. data->nodes[] holds both
	 * old and new nodes at this point.
	 */
	if (arg->flags & IORING_REGISTER_DST_REPLACE)
		io_rsrc_data_free(ctx, &ctx->buf_table);

	/*
	 * ctx->buf_table must be empty now - either the contents are being
	 * replaced and we just freed the table, or the contents are being
	 * copied to a ring that does not have buffers yet (checked at function
	 * entry).
	 */
	WARN_ON_ONCE(ctx->buf_table.nr);
	ctx->buf_table = data;
	return 0;

out_free:
	io_rsrc_data_free(ctx, &data);
	return ret;
}

/*
 * Copy the registered buffers from the source ring whose file descriptor
 * is given in the src_fd to the current ring. This is identical to registering
 * the buffers with ctx, except faster as mappings already exist.
 *
 * Since the memory is already accounted once, don't account it again.
 */
int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_clone_buffers buf;
	struct io_ring_ctx *src_ctx;
	bool registered_src;
	struct file *file;
	int ret;

	if (copy_from_user(&buf, arg, sizeof(buf)))
		return -EFAULT;
	if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE))
		return -EINVAL;
	if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr)
		return -EBUSY;
	if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
		return -EINVAL;

	registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
	file = io_uring_register_get_file(buf.src_fd, registered_src);
	if (IS_ERR(file))
		return PTR_ERR(file);

	src_ctx = file->private_data;
	if (src_ctx != ctx) {
		mutex_unlock(&ctx->uring_lock);
		lock_two_rings(ctx, src_ctx);
	}

	ret = io_clone_buffers(ctx, src_ctx, &buf);

	if (src_ctx != ctx)
		mutex_unlock(&src_ctx->uring_lock);

	fput(file);
	return ret;
}
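
/*
 * Illustrative userspace sketch (not part of this file): cloning all
 * registered buffers of src_ring_fd into this ring via
 * io_register_clone_buffers() above. The invocation is an assumption for
 * illustration; only the fields read by the kernel side are set, and
 * everything else (including pad[]) must be zero.
 *
 *	struct io_uring_clone_buffers buf = {
 *		.src_fd	= src_ring_fd,
 *		.flags	= IORING_REGISTER_DST_REPLACE,
 *	};
 *	io_uring_register(ring_fd, IORING_REGISTER_CLONE_BUFFERS, &buf, 1);
 */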