// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "openclose.h"
#include "rsrc.h"

struct io_rsrc_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
};

static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage);

#define IO_RSRC_REF_BATCH	100

/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)

void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	if (ctx->rsrc_cached_refs) {
		io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs);
		ctx->rsrc_cached_refs = 0;
	}
}

int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
	return 0;
}

static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	if (ctx->user)
		__io_unaccount_mem(ctx->user, nr_pages);

	if (ctx->mm_account)
		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}

static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	int ret;

	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
	}

	if (ctx->mm_account)
		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);

	return 0;
}

static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
		       void __user *arg, unsigned index)
{
	struct iovec __user *src;

#ifdef CONFIG_COMPAT
	if (ctx->compat) {
		struct compat_iovec __user *ciovs;
		struct compat_iovec ciov;

		ciovs = (struct compat_iovec __user *) arg;
		if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
			return -EFAULT;

		dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
		dst->iov_len = ciov.iov_len;
		return 0;
	}
#endif
	src = (struct iovec __user *) arg;
	if (copy_from_user(dst, &src[index], sizeof(*dst)))
		return -EFAULT;
	return 0;
}

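/*
 * Basic sanity checks on an iovec about to be registered: a NULL base is
 * only allowed with a zero length (a sparse slot), the length is capped at
 * 1G, and the page-aligned range must not wrap the address space.
 */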
static int io_buffer_validate(struct iovec *iov)
{
	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);

	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
	if (!iov->iov_base)
		return iov->iov_len ? -EFAULT : 0;
	if (!iov->iov_len)
		return -EFAULT;

	/* arbitrary limit, but we need something */
	if (iov->iov_len > SZ_1G)
		return -EFAULT;

	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
		return -EOVERFLOW;

	return 0;
}

static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
{
	struct io_mapped_ubuf *imu = *slot;
	unsigned int i;

	if (imu != ctx->dummy_ubuf) {
		for (i = 0; i < imu->nr_bvecs; i++)
			unpin_user_page(imu->bvec[i].bv_page);
		if (imu->acct_pages)
			io_unaccount_mem(ctx, imu->acct_pages);
		kvfree(imu);
	}
	*slot = NULL;
}

void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
	percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
}

static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
{
	struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
	struct io_ring_ctx *ctx = rsrc_data->ctx;
	struct io_rsrc_put *prsrc, *tmp;

	list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
		list_del(&prsrc->list);

		if (prsrc->tag) {
			if (ctx->flags & IORING_SETUP_IOPOLL) {
				mutex_lock(&ctx->uring_lock);
				io_post_aux_cqe(ctx, prsrc->tag, 0, 0, true);
				mutex_unlock(&ctx->uring_lock);
			} else {
				io_post_aux_cqe(ctx, prsrc->tag, 0, 0, true);
			}
		}

		rsrc_data->do_put(ctx, prsrc);
		kfree(prsrc);
	}

	io_rsrc_node_destroy(ref_node);
	if (atomic_dec_and_test(&rsrc_data->refs))
		complete(&rsrc_data->done);
}

void io_rsrc_put_work(struct work_struct *work)
{
	struct io_ring_ctx *ctx;
	struct llist_node *node;

	ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
	node = llist_del_all(&ctx->rsrc_put_llist);

	while (node) {
		struct io_rsrc_node *ref_node;
		struct llist_node *next = node->next;

		ref_node = llist_entry(node, struct io_rsrc_node, llist);
		__io_rsrc_put_work(ref_node);
		node = next;
	}
}

void io_wait_rsrc_data(struct io_rsrc_data *data)
{
	if (data && !atomic_dec_and_test(&data->refs))
		wait_for_completion(&data->done);
}

void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
{
	percpu_ref_exit(&ref_node->refs);
	kfree(ref_node);
}

static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
{
	struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
	struct io_ring_ctx *ctx = node->rsrc_data->ctx;
	unsigned long flags;
	bool first_add = false;
	unsigned long delay = HZ;

	spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
	node->done = true;

	/* if we are mid-quiesce then do not delay */
	if (node->rsrc_data->quiesce)
		delay = 0;

	while (!list_empty(&ctx->rsrc_ref_list)) {
		node = list_first_entry(&ctx->rsrc_ref_list,
					struct io_rsrc_node, node);
		/* recycle ref nodes in order */
		if (!node->done)
			break;
		list_del(&node->node);
		first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
	}
	spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);

	if (first_add)
		mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
}

static struct io_rsrc_node *io_rsrc_node_alloc(void)
{
	struct io_rsrc_node *ref_node;

	ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
	if (!ref_node)
		return NULL;

	if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
			    0, GFP_KERNEL)) {
		kfree(ref_node);
		return NULL;
	}
	INIT_LIST_HEAD(&ref_node->node);
	INIT_LIST_HEAD(&ref_node->rsrc_list);
	ref_node->done = false;
	return ref_node;
}

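/*
 * Switch the ring over to a fresh rsrc node (preallocated via
 * io_rsrc_node_switch_start()). If @data_to_kill is given, the current
 * node is attached to it, queued on the ref list and its percpu ref is
 * killed, so the node's queued puts are processed once all inflight
 * requests holding a reference to it have dropped it.
 */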
void io_rsrc_node_switch(struct io_ring_ctx *ctx,
			 struct io_rsrc_data *data_to_kill)
	__must_hold(&ctx->uring_lock)
{
	WARN_ON_ONCE(!ctx->rsrc_backup_node);
	WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);

	io_rsrc_refs_drop(ctx);

	if (data_to_kill) {
		struct io_rsrc_node *rsrc_node = ctx->rsrc_node;

		rsrc_node->rsrc_data = data_to_kill;
		spin_lock_irq(&ctx->rsrc_ref_lock);
		list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
		spin_unlock_irq(&ctx->rsrc_ref_lock);

		atomic_inc(&data_to_kill->refs);
		percpu_ref_kill(&rsrc_node->refs);
		ctx->rsrc_node = NULL;
	}

	if (!ctx->rsrc_node) {
		ctx->rsrc_node = ctx->rsrc_backup_node;
		ctx->rsrc_backup_node = NULL;
	}
}

int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
{
	if (ctx->rsrc_backup_node)
		return 0;
	ctx->rsrc_backup_node = io_rsrc_node_alloc();
	return ctx->rsrc_backup_node ? 0 : -ENOMEM;
}

__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
				      struct io_ring_ctx *ctx)
{
	int ret;

	/* As we may drop ->uring_lock, other task may have started quiesce */
	if (data->quiesce)
		return -ENXIO;

	data->quiesce = true;
	do {
		ret = io_rsrc_node_switch_start(ctx);
		if (ret)
			break;
		io_rsrc_node_switch(ctx, data);

		/* kill initial ref, already quiesced if zero */
		if (atomic_dec_and_test(&data->refs))
			break;
		mutex_unlock(&ctx->uring_lock);
		flush_delayed_work(&ctx->rsrc_put_work);
		ret = wait_for_completion_interruptible(&data->done);
		if (!ret) {
			mutex_lock(&ctx->uring_lock);
			if (atomic_read(&data->refs) > 0) {
				/*
				 * it has been revived by another thread while
				 * we were unlocked
				 */
				mutex_unlock(&ctx->uring_lock);
			} else {
				break;
			}
		}

		atomic_inc(&data->refs);
		/* wait for all works potentially completing data->done */
		flush_delayed_work(&ctx->rsrc_put_work);
		reinit_completion(&data->done);

		ret = io_run_task_work_sig();
		mutex_lock(&ctx->uring_lock);
	} while (ret >= 0);
	data->quiesce = false;

	return ret;
}

static void io_free_page_table(void **table, size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);

	for (i = 0; i < nr_tables; i++)
		kfree(table[i]);
	kfree(table);
}

static void io_rsrc_data_free(struct io_rsrc_data *data)
{
	size_t size = data->nr * sizeof(data->tags[0][0]);

	if (data->tags)
		io_free_page_table((void **)data->tags, size);
	kfree(data);
}

static __cold void **io_alloc_page_table(size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
	size_t init_size = size;
	void **table;

	table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
	if (!table)
		return NULL;

	for (i = 0; i < nr_tables; i++) {
		unsigned int this_size = min_t(size_t, size, PAGE_SIZE);

		table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
		if (!table[i]) {
			io_free_page_table(table, init_size);
			return NULL;
		}
		size -= this_size;
	}
	return table;
}

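/*
 * Allocate the io_rsrc_data bookkeeping for @nr resources and, if @utags
 * is non-NULL, copy the per-slot tags in from userspace.
 */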
__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx,
				     rsrc_put_fn *do_put, u64 __user *utags,
				     unsigned nr, struct io_rsrc_data **pdata)
{
	struct io_rsrc_data *data;
	int ret = -ENOMEM;
	unsigned i;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return -ENOMEM;
	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
	if (!data->tags) {
		kfree(data);
		return -ENOMEM;
	}

	data->nr = nr;
	data->ctx = ctx;
	data->do_put = do_put;
	if (utags) {
		ret = -EFAULT;
		for (i = 0; i < nr; i++) {
			u64 *tag_slot = io_get_tag_slot(data, i);

			if (copy_from_user(tag_slot, &utags[i],
					   sizeof(*tag_slot)))
				goto fail;
		}
	}

	atomic_set(&data->refs, 1);
	init_completion(&data->done);
	*pdata = data;
	return 0;
fail:
	io_rsrc_data_free(data);
	return ret;
}

static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	struct io_rsrc_data *data = ctx->file_data;
	struct io_fixed_file *file_slot;
	struct file *file;
	int fd, i, err = 0;
	unsigned int done;
	bool needs_switch = false;

	if (!ctx->file_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_files)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
		file_slot = io_fixed_file_slot(&ctx->file_table, i);

		if (file_slot->file_ptr) {
			file = (struct file *)(file_slot->file_ptr & FFS_MASK);
			err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
			if (err)
				break;
			file_slot->file_ptr = 0;
			io_file_bitmap_clear(&ctx->file_table, i);
			needs_switch = true;
		}
		if (fd != -1) {
			file = fget(fd);
			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered. If
			 * UNIX isn't enabled, then this causes a reference
			 * cycle and this instance can never get freed. If UNIX
			 * is enabled we'll handle it just fine, but there's
			 * still no point in allowing a ring fd as it doesn't
			 * support regular read/write anyway.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			err = io_scm_file_account(ctx, file);
			if (err) {
				fput(file);
				break;
			}
			*io_get_tag_slot(data, i) = tag;
			io_fixed_file_set(file_slot, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}

	if (needs_switch)
		io_rsrc_node_switch(ctx, data);
	return done ? done : err;
}

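/*
 * Update a range of registered buffers: pin and map each new iovec, queue
 * any existing mapping for removal via the current rsrc node, and record
 * the new tag for the slot.
 */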
static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
	struct page *last_hpage = NULL;
	bool needs_switch = false;
	__u32 done;
	int i, err;

	if (!ctx->buf_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_bufs)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_mapped_ubuf *imu;
		int offset = up->offset + done;
		u64 tag = 0;

		err = io_copy_iov(ctx, &iov, iovs, done);
		if (err)
			break;
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(&iov);
		if (err)
			break;
		if (!iov.iov_base && tag) {
			err = -EINVAL;
			break;
		}
		err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
		if (err)
			break;

		i = array_index_nospec(offset, ctx->nr_user_bufs);
		if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
			err = io_queue_rsrc_removal(ctx->buf_data, i,
						    ctx->rsrc_node, ctx->user_bufs[i]);
			if (unlikely(err)) {
				io_buffer_unmap(ctx, &imu);
				break;
			}
			ctx->user_bufs[i] = ctx->dummy_ubuf;
			needs_switch = true;
		}

		ctx->user_bufs[i] = imu;
		*io_get_tag_slot(ctx->buf_data, offset) = tag;
	}

	if (needs_switch)
		io_rsrc_node_switch(ctx, ctx->buf_data);
	return done ? done : err;
}

static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args)
{
	__u32 tmp;
	int err;

	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;
	err = io_rsrc_node_switch_start(ctx);
	if (err)
		return err;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}

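/*
 * Prep and issue handlers for IORING_OP_FILES_UPDATE, which updates the
 * registered file table from within the ring rather than through
 * io_uring_register().
 */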
int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	return 0;
}

static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_data)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}

int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;
	int ret;

	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
						&up2, up->nr_args);
		io_ring_submit_unlock(ctx, issue_flags);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
			  struct io_rsrc_node *node, void *rsrc)
{
	u64 *tag_slot = io_get_tag_slot(data, idx);
	struct io_rsrc_put *prsrc;

	prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
	if (!prsrc)
		return -ENOMEM;

	prsrc->tag = *tag_slot;
	*tag_slot = 0;
	prsrc->rsrc = rsrc;
	list_add(&prsrc->list, &node->rsrc_list);
	return 0;
}

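/*
 * Tear down the registered file table: put files that were not handed off
 * to the UNIX socket GC, drain any SCM_RIGHTS skbs still queued on the
 * ring socket, and free the file table and its rsrc data.
 */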
void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
#if !defined(IO_URING_SCM_ALL)
	int i;

	for (i = 0; i < ctx->nr_user_files; i++) {
		struct file *file = io_file_from_index(&ctx->file_table, i);

		if (!file)
			continue;
		if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM)
			continue;
		io_file_bitmap_clear(&ctx->file_table, i);
		fput(file);
	}
#endif

#if defined(CONFIG_UNIX)
	if (ctx->ring_sock) {
		struct sock *sock = ctx->ring_sock->sk;
		struct sk_buff *skb;

		while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
			kfree_skb(skb);
	}
#endif
	io_free_file_tables(&ctx->file_table);
	io_rsrc_data_free(ctx->file_data);
	ctx->file_data = NULL;
	ctx->nr_user_files = 0;
}

int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_files;
	int ret;

	if (!ctx->file_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_files = 0;
	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
	ctx->nr_user_files = nr;
	if (!ret)
		__io_sqe_files_unregister(ctx);
	return ret;
}

/*
 * Ensure the UNIX gc is aware of our file set, so we are certain that
 * the io_uring can be safely unregistered on process exit, even if we have
 * loops in the file referencing. We account only files that can hold other
 * files because otherwise they can't form a loop and so are not interesting
 * for GC.
 */
int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
{
#if defined(CONFIG_UNIX)
	struct sock *sk = ctx->ring_sock->sk;
	struct sk_buff_head *head = &sk->sk_receive_queue;
	struct scm_fp_list *fpl;
	struct sk_buff *skb;

	if (likely(!io_file_need_scm(file)))
		return 0;

	/*
	 * See if we can merge this file into an existing skb SCM_RIGHTS
	 * file set. If there's no room, fall back to allocating a new skb
	 * and filling it in.
	 */
	spin_lock_irq(&head->lock);
	skb = skb_peek(head);
	if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD)
		__skb_unlink(skb, head);
	else
		skb = NULL;
	spin_unlock_irq(&head->lock);

	if (!skb) {
		fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
		if (!fpl)
			return -ENOMEM;

		skb = alloc_skb(0, GFP_KERNEL);
		if (!skb) {
			kfree(fpl);
			return -ENOMEM;
		}

		fpl->user = get_uid(current_user());
		fpl->max = SCM_MAX_FD;
		fpl->count = 0;

		UNIXCB(skb).fp = fpl;
		skb->sk = sk;
		skb->destructor = unix_destruct_scm;
		refcount_add(skb->truesize, &sk->sk_wmem_alloc);
	}

	fpl = UNIXCB(skb).fp;
	fpl->fp[fpl->count++] = get_file(file);
	unix_inflight(fpl->user, file);
	skb_queue_head(head, skb);
	fput(file);
#endif
	return 0;
}

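/*
 * Drop a registered file. With CONFIG_UNIX the file may sit in one of the
 * SCM_RIGHTS skbs queued on the ring socket: find that skb, remove the
 * file from its fp array (freeing the skb if it becomes empty) and put
 * the file reference.
 */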
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	struct file *file = prsrc->file;
#if defined(CONFIG_UNIX)
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head list, *head = &sock->sk_receive_queue;
	struct sk_buff *skb;
	int i;

	if (!io_file_need_scm(file)) {
		fput(file);
		return;
	}

	__skb_queue_head_init(&list);

	/*
	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
	 * remove this entry and rearrange the file array.
	 */
	skb = skb_dequeue(head);
	while (skb) {
		struct scm_fp_list *fp;

		fp = UNIXCB(skb).fp;
		for (i = 0; i < fp->count; i++) {
			int left;

			if (fp->fp[i] != file)
				continue;

			unix_notinflight(fp->user, fp->fp[i]);
			left = fp->count - 1 - i;
			if (left) {
				memmove(&fp->fp[i], &fp->fp[i + 1],
					left * sizeof(struct file *));
			}
			fp->count--;
			if (!fp->count) {
				kfree_skb(skb);
				skb = NULL;
			} else {
				__skb_queue_tail(&list, skb);
			}
			fput(file);
			file = NULL;
			break;
		}

		if (!file)
			break;

		__skb_queue_tail(&list, skb);

		skb = skb_dequeue(head);
	}

	if (skb_peek(&list)) {
		spin_lock_irq(&head->lock);
		while ((skb = __skb_dequeue(&list)) != NULL)
			__skb_queue_tail(head, skb);
		spin_unlock_irq(&head->lock);
	}
#else
	fput(file);
#endif
}

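/*
 * Register an array of file descriptors as the ring's fixed file table
 * (IORING_REGISTER_FILES/IORING_REGISTER_FILES2). An fd of -1, or a NULL
 * array for a sparse registration, leaves the slot empty so it can be
 * filled in later with a file update.
 */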
int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_data)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		return ret;
	ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
				 &ctx->file_data);
	if (ret)
		return ret;

	if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
		io_rsrc_data_free(ctx->file_data);
		ctx->file_data = NULL;
		return -ENOMEM;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
		struct io_fixed_file *file_slot;

		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
			ret = -EFAULT;
			goto fail;
		}
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered. If UNIX
		 * isn't enabled, then this causes a reference cycle and this
		 * instance can never get freed. If UNIX is enabled we'll
		 * handle it just fine, but there's still no point in allowing
		 * a ring fd as it doesn't support regular read/write anyway.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		ret = io_scm_file_account(ctx, file);
		if (ret) {
			fput(file);
			goto fail;
		}
		file_slot = io_fixed_file_slot(&ctx->file_table, i);
		io_fixed_file_set(file_slot, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files);
	io_rsrc_node_switch(ctx, NULL);
	return 0;
fail:
	__io_sqe_files_unregister(ctx);
	return ret;
}

static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	io_buffer_unmap(ctx, &prsrc->buf);
	prsrc->buf = NULL;
}

void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned int i;

	for (i = 0; i < ctx->nr_user_bufs; i++)
		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
	kfree(ctx->user_bufs);
	io_rsrc_data_free(ctx->buf_data);
	ctx->user_bufs = NULL;
	ctx->buf_data = NULL;
	ctx->nr_user_bufs = 0;
}

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_bufs;
	int ret;

	if (!ctx->buf_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_bufs = 0;
	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
	ctx->nr_user_bufs = nr;
	if (!ret)
		__io_sqe_buffers_unregister(ctx);
	return ret;
}

/*
 * Not super efficient, but this only happens at registration time. And we
 * do cache the last compound head, so generally we'll only do a full search
 * if we don't match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->nr_user_bufs; i++) {
		struct io_mapped_ubuf *imu = ctx->user_bufs[i];

		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}

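/*
 * Charge the pinned pages of a buffer against the memlock limit. Compound
 * (huge) pages are charged once at their full size, and only if their head
 * page has not already been accounted for this or a previous buffer.
 */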
static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}

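/*
 * Pin the user pages backing [ubuf, ubuf + len). Pages are pinned with
 * FOLL_WRITE | FOLL_LONGTERM, and file-backed mappings other than shmem
 * and hugetlb are rejected. Returns the page array (with *npages set) on
 * success, or an ERR_PTR() on failure.
 */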
struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
{
	unsigned long start, end, nr_pages;
	struct vm_area_struct **vmas = NULL;
	struct page **pages = NULL;
	int i, pret, ret = -ENOMEM;

	end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	start = ubuf >> PAGE_SHIFT;
	nr_pages = end - start;

	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		goto done;

	vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
			      GFP_KERNEL);
	if (!vmas)
		goto done;

	ret = 0;
	mmap_read_lock(current->mm);
	pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
			      pages, vmas);
	if (pret == nr_pages) {
		/* don't support file backed memory */
		for (i = 0; i < nr_pages; i++) {
			struct vm_area_struct *vma = vmas[i];

			if (vma_is_shmem(vma))
				continue;
			if (vma->vm_file &&
			    !is_file_hugepages(vma->vm_file)) {
				ret = -EOPNOTSUPP;
				break;
			}
		}
		*npages = nr_pages;
	} else {
		ret = pret < 0 ? pret : -EFAULT;
	}
	mmap_read_unlock(current->mm);
	if (ret) {
		/*
		 * if we did partial map, or found file backed vmas,
		 * release any pages we did get
		 */
		if (pret > 0)
			unpin_user_pages(pages, pret);
		goto done;
	}
	ret = 0;
done:
	kvfree(vmas);
	if (ret < 0) {
		kvfree(pages);
		pages = ERR_PTR(ret);
	}
	return pages;
}

static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;

	*pimu = ctx->dummy_ubuf;
	if (!iov->iov_base)
		return 0;

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
			     &nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
	if (!imu)
		goto done;

	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret) {
		unpin_user_pages(pages, nr_pages);
		goto done;
	}

	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
	size = iov->iov_len;
	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, PAGE_SIZE - off);
		imu->bvec[i].bv_page = pages[i];
		imu->bvec[i].bv_len = vec_len;
		imu->bvec[i].bv_offset = off;
		off = 0;
		size -= vec_len;
	}
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->ubuf_end = imu->ubuf + iov->iov_len;
	imu->nr_bvecs = nr_pages;
	*pimu = imu;
	ret = 0;
done:
	if (ret)
		kvfree(imu);
	kvfree(pages);
	return ret;
}

static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
{
	ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
	return ctx->user_bufs ? 0 : -ENOMEM;
}

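/*
 * Register an array of iovecs as fixed buffers (IORING_REGISTER_BUFFERS/
 * IORING_REGISTER_BUFFERS2). Each buffer is pinned and described by an
 * io_mapped_ubuf; a NULL iovec (or a NULL @arg for a sparse registration)
 * installs the dummy buffer so the slot can be populated later by an
 * update.
 */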
int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data *data;
	int i, ret;
	struct iovec iov;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->user_bufs)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		return ret;
	ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
	if (ret)
		return ret;
	ret = io_buffers_map_alloc(ctx, nr_args);
	if (ret) {
		io_rsrc_data_free(data);
		return ret;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
		if (arg) {
			ret = io_copy_iov(ctx, &iov, arg, i);
			if (ret)
				break;
			ret = io_buffer_validate(&iov);
			if (ret)
				break;
		} else {
			memset(&iov, 0, sizeof(iov));
		}

		if (!iov.iov_base && *io_get_tag_slot(data, i)) {
			ret = -EINVAL;
			break;
		}

		ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
					     &last_hpage);
		if (ret)
			break;
	}

	WARN_ON_ONCE(ctx->buf_data);

	ctx->buf_data = data;
	if (ret)
		__io_sqe_buffers_unregister(ctx);
	else
		io_rsrc_node_switch(ctx, NULL);
	return ret;
}

int io_import_fixed(int ddir, struct iov_iter *iter,
		    struct io_mapped_ubuf *imu,
		    u64 buf_addr, size_t len)
{
	u64 buf_end;
	size_t offset;

	if (WARN_ON_ONCE(!imu))
		return -EFAULT;
	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
		return -EFAULT;

	/*
	 * May not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);

	if (offset) {
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here, because
		 * we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or the whole first
		 * bvec), just use iov_iter_advance(). This makes it easier
		 * since we can just skip the first segment, which may not
		 * be PAGE_SIZE aligned.
		 */
		const struct bio_vec *bvec = imu->bvec;

		if (offset <= bvec->bv_len) {
			iov_iter_advance(iter, offset);
		} else {
			unsigned long seg_skip;

			/* skip first vec */
			offset -= bvec->bv_len;
			seg_skip = 1 + (offset >> PAGE_SHIFT);

			iter->bvec = bvec + seg_skip;
			iter->nr_segs -= seg_skip;
			iter->count -= bvec->bv_len + offset;
			iter->iov_offset = offset & ~PAGE_MASK;
		}
	}

	return 0;
}