// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blk-mq.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/fsnotify.h>
#include <linux/poll.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring/cmd.h>
#include <linux/indirect_call_wrapper.h>

#include <uapi/linux/io_uring.h>

#include "filetable.h"
#include "io_uring.h"
#include "opdef.h"
#include "kbuf.h"
#include "alloc_cache.h"
#include "rsrc.h"
#include "poll.h"
#include "rw.h"

static void io_complete_rw(struct kiocb *kiocb, long res);
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res);

struct io_rw {
	/* NOTE: kiocb has the file as the first member, so don't do it here */
	struct kiocb			kiocb;
	u64				addr;
	u32				len;
	rwf_t				flags;
};

static bool io_file_supports_nowait(struct io_kiocb *req, __poll_t mask)
{
	/* If FMODE_NOWAIT is set for a file, we're golden */
	if (req->flags & REQ_F_SUPPORT_NOWAIT)
		return true;
	/* No FMODE_NOWAIT, if we can poll, check the status */
	if (io_file_can_poll(req)) {
		struct poll_table_struct pt = { ._key = mask };

		return vfs_poll(req->file, &pt) & mask;
	}
	/* No FMODE_NOWAIT support, and file isn't pollable. Tough luck. */
	return false;
}

static int io_iov_compat_buffer_select_prep(struct io_rw *rw)
{
	struct compat_iovec __user *uiov = u64_to_user_ptr(rw->addr);
	struct compat_iovec iov;

	if (copy_from_user(&iov, uiov, sizeof(iov)))
		return -EFAULT;
	rw->len = iov.iov_len;
	return 0;
}

static int io_iov_buffer_select_prep(struct io_kiocb *req)
{
	struct iovec __user *uiov;
	struct iovec iov;
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);

	if (rw->len != 1)
		return -EINVAL;

	if (io_is_compat(req->ctx))
		return io_iov_compat_buffer_select_prep(rw);

	uiov = u64_to_user_ptr(rw->addr);
	if (copy_from_user(&iov, uiov, sizeof(*uiov)))
		return -EFAULT;
	rw->len = iov.iov_len;
	return 0;
}

static int io_import_vec(int ddir, struct io_kiocb *req,
			 struct io_async_rw *io,
			 const struct iovec __user *uvec,
			 size_t uvec_segs)
{
	int ret, nr_segs;
	struct iovec *iov;

	if (io->vec.iovec) {
		nr_segs = io->vec.nr;
		iov = io->vec.iovec;
	} else {
		nr_segs = 1;
		iov = &io->fast_iov;
	}

	ret = __import_iovec(ddir, uvec, uvec_segs, nr_segs, &iov, &io->iter,
			     io_is_compat(req->ctx));
	if (unlikely(ret < 0))
		return ret;
	if (iov) {
		req->flags |= REQ_F_NEED_CLEANUP;
		io_vec_reset_iovec(&io->vec, iov, io->iter.nr_segs);
	}
	return 0;
}

static int __io_import_rw_buffer(int ddir, struct io_kiocb *req,
				 struct io_async_rw *io, struct io_br_sel *sel,
				 unsigned int issue_flags)
{
	const struct io_issue_def *def = &io_issue_defs[req->opcode];
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	size_t sqe_len = rw->len;

	sel->addr = u64_to_user_ptr(rw->addr);
	if (def->vectored && !(req->flags & REQ_F_BUFFER_SELECT))
		return io_import_vec(ddir, req, io, sel->addr, sqe_len);

	if (io_do_buffer_select(req)) {
		*sel = io_buffer_select(req, &sqe_len, io->buf_group, issue_flags);
		if (!sel->addr)
			return -ENOBUFS;
		rw->addr = (unsigned long) sel->addr;
		rw->len = sqe_len;
	}
	return import_ubuf(ddir, sel->addr, sqe_len, &io->iter);
}

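/*
 * Import the request's user memory and snapshot the iterator state. The
 * saved state is what the issue paths (__io_read()/io_write()) restore and
 * re-advance after a failed or partial nonblocking attempt.
 */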
static inline int io_import_rw_buffer(int rw, struct io_kiocb *req,
				      struct io_async_rw *io,
				      struct io_br_sel *sel,
				      unsigned int issue_flags)
{
	int ret;

	ret = __io_import_rw_buffer(rw, req, io, sel, issue_flags);
	if (unlikely(ret < 0))
		return ret;

	iov_iter_save_state(&io->iter, &io->iter_state);
	return 0;
}

static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_async_rw *rw = req->async_data;

	if (unlikely(issue_flags & IO_URING_F_UNLOCKED))
		return;

	io_alloc_cache_vec_kasan(&rw->vec);
	if (rw->vec.nr > IO_VEC_CACHE_SOFT_CAP)
		io_vec_free(&rw->vec);

	if (io_alloc_cache_put(&req->ctx->rw_cache, rw))
		io_req_async_data_clear(req, 0);
}

static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags)
{
	/*
	 * Disable quick recycling for anything that's gone through io-wq.
	 * In theory, this should be fine to cleanup. However, some read or
	 * write iter handling touches the iovec AFTER having called into the
	 * handler, eg to reexpand or revert. This means we can have:
	 *
	 * task				io-wq
	 *   issue
	 *     punt to io-wq
	 *				issue
	 *				  blkdev_write_iter()
	 *				    ->ki_complete()
	 *				      io_complete_rw()
	 *				        queue tw complete
	 *  run tw
	 *    req_rw_cleanup
	 *	iov_iter_count() <- look at iov_iter again
	 *
	 * which can lead to a UAF. This is only possible for io-wq offload
	 * as the cleanup can run in parallel. As io-wq is not the fast path,
	 * just leave cleanup to the end.
	 *
	 * This is really a bug in the core code that does this, any issue
	 * path should assume that a successful (or -EIOCBQUEUED) return can
	 * mean that the underlying data can be gone at any time. But that
	 * should be fixed separately, and then this check could be killed.
	 */
	if (!(req->flags & (REQ_F_REISSUE | REQ_F_REFCOUNT))) {
		req->flags &= ~REQ_F_NEED_CLEANUP;
		io_rw_recycle(req, issue_flags);
	}
}

static int io_rw_alloc_async(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_async_rw *rw;

	rw = io_uring_alloc_async_data(&ctx->rw_cache, req);
	if (!rw)
		return -ENOMEM;
	if (rw->vec.iovec)
		req->flags |= REQ_F_NEED_CLEANUP;
	rw->bytes_done = 0;
	return 0;
}

static inline void io_meta_save_state(struct io_async_rw *io)
{
	io->meta_state.seed = io->meta.seed;
	iov_iter_save_state(&io->meta.iter, &io->meta_state.iter_meta);
}

static inline void io_meta_restore(struct io_async_rw *io, struct kiocb *kiocb)
{
	if (kiocb->ki_flags & IOCB_HAS_METADATA) {
		io->meta.seed = io->meta_state.seed;
		iov_iter_restore(&io->meta.iter, &io->meta_state.iter_meta);
	}
}

static int io_prep_rw_pi(struct io_kiocb *req, struct io_rw *rw, int ddir,
			 u64 attr_ptr, u64 attr_type_mask)
{
	struct io_uring_attr_pi pi_attr;
	struct io_async_rw *io;
	int ret;

	if (copy_from_user(&pi_attr, u64_to_user_ptr(attr_ptr),
			   sizeof(pi_attr)))
		return -EFAULT;

	if (pi_attr.rsvd)
		return -EINVAL;

	io = req->async_data;
	io->meta.flags = pi_attr.flags;
	io->meta.app_tag = pi_attr.app_tag;
	io->meta.seed = pi_attr.seed;
	ret = import_ubuf(ddir, u64_to_user_ptr(pi_attr.addr),
			  pi_attr.len, &io->meta.iter);
	if (unlikely(ret < 0))
		return ret;
	req->flags |= REQ_F_HAS_METADATA;
	io_meta_save_state(io);
	return ret;
}

static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
			int ddir)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct io_async_rw *io;
	unsigned ioprio;
	u64 attr_type_mask;
	int ret;

	if (io_rw_alloc_async(req))
		return -ENOMEM;
	io = req->async_data;

	rw->kiocb.ki_pos = READ_ONCE(sqe->off);
	/* used for fixed read/write too - just read unconditionally */
	req->buf_index = READ_ONCE(sqe->buf_index);
	io->buf_group = req->buf_index;

	ioprio = READ_ONCE(sqe->ioprio);
	if (ioprio) {
		ret = ioprio_check_cap(ioprio);
		if (ret)
			return ret;

		rw->kiocb.ki_ioprio = ioprio;
	} else {
		rw->kiocb.ki_ioprio = get_current_ioprio();
	}
	rw->kiocb.dio_complete = NULL;
	rw->kiocb.ki_flags = 0;
	rw->kiocb.ki_write_stream = READ_ONCE(sqe->write_stream);

	if (req->ctx->flags & IORING_SETUP_IOPOLL)
		rw->kiocb.ki_complete = io_complete_rw_iopoll;
	else
		rw->kiocb.ki_complete = io_complete_rw;

	rw->addr = READ_ONCE(sqe->addr);
	rw->len = READ_ONCE(sqe->len);
	rw->flags = (__force rwf_t) READ_ONCE(sqe->rw_flags);

	attr_type_mask = READ_ONCE(sqe->attr_type_mask);
	if (attr_type_mask) {
		u64 attr_ptr;

		/* only PI attribute is supported currently */
		if (attr_type_mask != IORING_RW_ATTR_FLAG_PI)
			return -EINVAL;

		attr_ptr = READ_ONCE(sqe->attr_ptr);
		return io_prep_rw_pi(req, rw, ddir, attr_ptr, attr_type_mask);
	}
	return 0;
}

static int io_rw_do_import(struct io_kiocb *req, int ddir)
{
	struct io_br_sel sel = { };

	if (io_do_buffer_select(req))
		return 0;

	return io_import_rw_buffer(ddir, req, req->async_data, &sel, 0);
}

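/*
 * io_prep_rw() below is the common prep step for IORING_OP_READ/WRITE and
 * the *v variants. For orientation, a minimal userspace submission that ends
 * up in this path could look roughly like the following liburing sketch
 * (illustrative only; 'fd' is assumed to be an already-open file, and error
 * handling is omitted):
 *
 *	struct io_uring ring;
 *	struct io_uring_sqe *sqe;
 *	struct io_uring_cqe *cqe;
 *	char buf[4096];
 *
 *	io_uring_queue_init(8, &ring, 0);
 *	sqe = io_uring_get_sqe(&ring);
 *	io_uring_prep_read(sqe, fd, buf, sizeof(buf), 0);
 *	io_uring_submit(&ring);
 *	io_uring_wait_cqe(&ring, &cqe);
 *	io_uring_cqe_seen(&ring, cqe);
 */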
static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
		      int ddir)
{
	int ret;

	ret = __io_prep_rw(req, sqe, ddir);
	if (unlikely(ret))
		return ret;

	return io_rw_do_import(req, ddir);
}

int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	return io_prep_rw(req, sqe, ITER_DEST);
}

int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	return io_prep_rw(req, sqe, ITER_SOURCE);
}

static int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe,
		       int ddir)
{
	int ret;

	ret = io_prep_rw(req, sqe, ddir);
	if (unlikely(ret))
		return ret;
	if (!(req->flags & REQ_F_BUFFER_SELECT))
		return 0;

	/*
	 * Have to do this validation here, as rw->len might have changed
	 * due to buffer selection by the time io_read() looks at it.
	 */
	return io_iov_buffer_select_prep(req);
}

int io_prep_readv(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	return io_prep_rwv(req, sqe, ITER_DEST);
}

int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	return io_prep_rwv(req, sqe, ITER_SOURCE);
}

static int io_init_rw_fixed(struct io_kiocb *req, unsigned int issue_flags,
			    int ddir)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct io_async_rw *io = req->async_data;
	int ret;

	if (io->bytes_done)
		return 0;

	ret = io_import_reg_buf(req, &io->iter, rw->addr, rw->len, ddir,
				issue_flags);
	iov_iter_save_state(&io->iter, &io->iter_state);
	return ret;
}

int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	return __io_prep_rw(req, sqe, ITER_DEST);
}

int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	return __io_prep_rw(req, sqe, ITER_SOURCE);
}

static int io_rw_import_reg_vec(struct io_kiocb *req,
				struct io_async_rw *io,
				int ddir, unsigned int issue_flags)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	unsigned uvec_segs = rw->len;
	int ret;

	ret = io_import_reg_vec(ddir, &io->iter, req, &io->vec,
				uvec_segs, issue_flags);
	if (unlikely(ret))
		return ret;
	iov_iter_save_state(&io->iter, &io->iter_state);
	req->flags &= ~REQ_F_IMPORT_BUFFER;
	return 0;
}

static int io_rw_prep_reg_vec(struct io_kiocb *req)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct io_async_rw *io = req->async_data;
	const struct iovec __user *uvec;

	uvec = u64_to_user_ptr(rw->addr);
	return io_prep_reg_iovec(req, &io->vec, uvec, rw->len);
}

int io_prep_readv_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	int ret;

	ret = __io_prep_rw(req, sqe, ITER_DEST);
	if (unlikely(ret))
		return ret;
	return io_rw_prep_reg_vec(req);
}

int io_prep_writev_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	int ret;

	ret = __io_prep_rw(req, sqe, ITER_SOURCE);
	if (unlikely(ret))
		return ret;
	return io_rw_prep_reg_vec(req);
}

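/*
 * Usage note: multishot reads only work with provided buffers. A submission
 * is expected to set IOSQE_BUFFER_SELECT, point sqe->buf_group at a buffer
 * group the application has registered, and leave sqe->addr/sqe->len at 0;
 * each completion then carries the picked buffer ID plus IORING_CQE_F_MORE
 * while the request stays armed. A rough, illustrative-only sketch of the
 * SQE setup from userspace ('bgid' being an assumed, pre-registered group):
 *
 *	sqe->opcode = IORING_OP_READ_MULTISHOT;
 *	sqe->fd = fd;
 *	sqe->flags |= IOSQE_BUFFER_SELECT;
 *	sqe->buf_group = bgid;
 */
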
/*
 * Multishot read is prepared just like a normal read/write request, the
 * only difference is that we set the MULTISHOT flag.
 */
int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	int ret;

	/* must be used with provided buffers */
	if (!(req->flags & REQ_F_BUFFER_SELECT))
		return -EINVAL;

	ret = __io_prep_rw(req, sqe, ITER_DEST);
	if (unlikely(ret))
		return ret;

	if (rw->addr || rw->len)
		return -EINVAL;

	req->flags |= REQ_F_APOLL_MULTISHOT;
	return 0;
}

void io_readv_writev_cleanup(struct io_kiocb *req)
{
	struct io_async_rw *rw = req->async_data;

	lockdep_assert_held(&req->ctx->uring_lock);
	io_vec_free(&rw->vec);
	io_rw_recycle(req, 0);
}

static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);

	if (rw->kiocb.ki_pos != -1)
		return &rw->kiocb.ki_pos;

	if (!(req->file->f_mode & FMODE_STREAM)) {
		req->flags |= REQ_F_CUR_POS;
		rw->kiocb.ki_pos = req->file->f_pos;
		return &rw->kiocb.ki_pos;
	}

	rw->kiocb.ki_pos = 0;
	return NULL;
}

static bool io_rw_should_reissue(struct io_kiocb *req)
{
#ifdef CONFIG_BLOCK
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	umode_t mode = file_inode(req->file)->i_mode;
	struct io_async_rw *io = req->async_data;
	struct io_ring_ctx *ctx = req->ctx;

	if (!S_ISBLK(mode) && !S_ISREG(mode))
		return false;
	if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
	    !(ctx->flags & IORING_SETUP_IOPOLL)))
		return false;
	/*
	 * If ref is dying, we might be running poll reap from the exit work.
	 * Don't attempt to reissue from that path, just let it fail with
	 * -EAGAIN.
	 */
	if (percpu_ref_is_dying(&ctx->refs))
		return false;

	io_meta_restore(io, &rw->kiocb);
	iov_iter_restore(&io->iter, &io->iter_state);
	return true;
#else
	return false;
#endif
}

static void io_req_end_write(struct io_kiocb *req)
{
	if (req->flags & REQ_F_ISREG) {
		struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);

		kiocb_end_write(&rw->kiocb);
	}
}

/*
 * Trigger the notifications after having done some IO, and finish the write
 * accounting, if any.
 */
static void io_req_io_end(struct io_kiocb *req)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);

	if (rw->kiocb.ki_flags & IOCB_WRITE) {
		io_req_end_write(req);
		fsnotify_modify(req->file);
	} else {
		fsnotify_access(req->file);
	}
}

static void __io_complete_rw_common(struct io_kiocb *req, long res)
{
	if (res == req->cqe.res)
		return;
	if ((res == -EOPNOTSUPP || res == -EAGAIN) && io_rw_should_reissue(req)) {
		req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE;
	} else {
		req_set_fail(req);
		req->cqe.res = res;
	}
}

static inline int io_fixup_rw_res(struct io_kiocb *req, long res)
{
	struct io_async_rw *io = req->async_data;

	/* add previously done IO, if any */
	if (req_has_async_data(req) && io->bytes_done > 0) {
		if (res < 0)
			res = io->bytes_done;
		else
			res += io->bytes_done;
	}
	return res;
}

void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct kiocb *kiocb = &rw->kiocb;

	if ((kiocb->ki_flags & IOCB_DIO_CALLER_COMP) && kiocb->dio_complete) {
		long res = kiocb->dio_complete(rw->kiocb.private);

		io_req_set_res(req, io_fixup_rw_res(req, res), 0);
	}

	io_req_io_end(req);

	if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))
		req->cqe.flags |= io_put_kbuf(req, req->cqe.res, NULL);

	io_req_rw_cleanup(req, 0);
	io_req_task_complete(req, tw);
}

static void io_complete_rw(struct kiocb *kiocb, long res)
{
	struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb);
	struct io_kiocb *req = cmd_to_io_kiocb(rw);

	if (!kiocb->dio_complete || !(kiocb->ki_flags & IOCB_DIO_CALLER_COMP)) {
		__io_complete_rw_common(req, res);
		io_req_set_res(req, io_fixup_rw_res(req, res), 0);
	}
	req->io_task_work.func = io_req_rw_complete;
	__io_req_task_work_add(req, IOU_F_TWQ_LAZY_WAKE);
}

static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
{
	struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb);
	struct io_kiocb *req = cmd_to_io_kiocb(rw);

	if (kiocb->ki_flags & IOCB_WRITE)
		io_req_end_write(req);
	if (unlikely(res != req->cqe.res)) {
		if (res == -EAGAIN && io_rw_should_reissue(req))
			req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE;
		else
			req->cqe.res = res;
	}

	/* order with io_iopoll_complete() checking ->iopoll_completed */
	smp_store_release(&req->iopoll_completed, 1);
}

static inline void io_rw_done(struct io_kiocb *req, ssize_t ret)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);

	/* IO was queued async, completion will happen later */
	if (ret == -EIOCBQUEUED)
		return;

	/* transform internal restart error codes */
	if (unlikely(ret < 0)) {
		switch (ret) {
		case -ERESTARTSYS:
		case -ERESTARTNOINTR:
		case -ERESTARTNOHAND:
		case -ERESTART_RESTARTBLOCK:
			/*
			 * We can't just restart the syscall, since previously
			 * submitted sqes may already be in progress. Just fail
			 * this IO with EINTR.
			 */
			ret = -EINTR;
			break;
		}
	}

	if (req->ctx->flags & IORING_SETUP_IOPOLL)
		io_complete_rw_iopoll(&rw->kiocb, ret);
	else
		io_complete_rw(&rw->kiocb, ret);
}

static int kiocb_done(struct io_kiocb *req, ssize_t ret,
		      struct io_br_sel *sel, unsigned int issue_flags)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	unsigned final_ret = io_fixup_rw_res(req, ret);

	if (ret >= 0 && req->flags & REQ_F_CUR_POS)
		req->file->f_pos = rw->kiocb.ki_pos;
	if (ret >= 0 && !(req->ctx->flags & IORING_SETUP_IOPOLL)) {
		u32 cflags = 0;

		__io_complete_rw_common(req, ret);
		/*
		 * Safe to call io_end from here as we're inline
		 * from the submission path.
		 */
		io_req_io_end(req);
		if (sel)
			cflags = io_put_kbuf(req, ret, sel->buf_list);
		io_req_set_res(req, final_ret, cflags);
		io_req_rw_cleanup(req, issue_flags);
		return IOU_COMPLETE;
	} else {
		io_rw_done(req, ret);
	}

	return IOU_ISSUE_SKIP_COMPLETE;
}

static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
{
	return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
}

/*
 * For files that don't have ->read_iter() and ->write_iter(), handle them
 * by looping over ->read() or ->write() manually.
 */
static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter)
{
	struct io_kiocb *req = cmd_to_io_kiocb(rw);
	struct kiocb *kiocb = &rw->kiocb;
	struct file *file = kiocb->ki_filp;
	ssize_t ret = 0;
	loff_t *ppos;

	/*
	 * Don't support polled IO through this interface, and we can't
	 * support non-blocking either. For the latter, this just causes
	 * the kiocb to be handled from an async context.
	 */
	if (kiocb->ki_flags & IOCB_HIPRI)
		return -EOPNOTSUPP;
	if ((kiocb->ki_flags & IOCB_NOWAIT) &&
	    !(kiocb->ki_filp->f_flags & O_NONBLOCK))
		return -EAGAIN;
	if ((req->flags & REQ_F_BUF_NODE) && req->buf_node->buf->is_kbuf)
		return -EFAULT;

	ppos = io_kiocb_ppos(kiocb);

	while (iov_iter_count(iter)) {
		void __user *addr;
		size_t len;
		ssize_t nr;

		if (iter_is_ubuf(iter)) {
			addr = iter->ubuf + iter->iov_offset;
			len = iov_iter_count(iter);
		} else if (!iov_iter_is_bvec(iter)) {
			addr = iter_iov_addr(iter);
			len = iter_iov_len(iter);
		} else {
			addr = u64_to_user_ptr(rw->addr);
			len = rw->len;
		}

		if (ddir == READ)
			nr = file->f_op->read(file, addr, len, ppos);
		else
			nr = file->f_op->write(file, addr, len, ppos);

		if (nr < 0) {
			if (!ret)
				ret = nr;
			break;
		}
		ret += nr;
		if (!iov_iter_is_bvec(iter)) {
			iov_iter_advance(iter, nr);
		} else {
			rw->addr += nr;
			rw->len -= nr;
			if (!rw->len)
				break;
		}
		if (nr != len)
			break;
	}

	return ret;
}

/*
 * This is our waitqueue callback handler, registered through __folio_lock_async()
 * when we initially tried to do the IO with an iocb that had our waitqueue
 * armed. This gets called when the page is unlocked, and we generally expect
 * that to happen when the page IO is completed and the page is now uptodate.
 * This will queue a task_work based retry of the operation, attempting to copy
 * the data again. If the latter fails because the page was NOT uptodate, then
 * we will do a thread based blocking retry of the operation. That's the
 * unexpected slow path.
 */
static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
			     int sync, void *arg)
{
	struct wait_page_queue *wpq;
	struct io_kiocb *req = wait->private;
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct wait_page_key *key = arg;

	wpq = container_of(wait, struct wait_page_queue, wait);

	if (!wake_page_match(wpq, key))
		return 0;

	rw->kiocb.ki_flags &= ~IOCB_WAITQ;
	list_del_init(&wait->entry);
	io_req_task_queue(req);
	return 1;
}

/*
 * This controls whether a given IO request should be armed for async page
 * based retry. If we return false here, the request is handed to the async
 * worker threads for retry. If we're doing buffered reads on a regular file,
 * we prepare a private wait_page_queue entry and retry the operation. This
 * will either succeed because the page is now uptodate and unlocked, or it
 * will register a callback when the page is unlocked at IO completion. Through
 * that callback, io_uring uses task_work to setup a retry of the operation.
 * That retry will attempt the buffered read again. The retry will generally
 * succeed, or in rare cases where it fails, we then fall back to using the
 * async worker threads for a blocking retry.
 */
static bool io_rw_should_retry(struct io_kiocb *req)
{
	struct io_async_rw *io = req->async_data;
	struct wait_page_queue *wait = &io->wpq;
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct kiocb *kiocb = &rw->kiocb;

	/*
	 * Never retry for NOWAIT or a request with metadata, we just complete
	 * with -EAGAIN.
	 */
	if (req->flags & (REQ_F_NOWAIT | REQ_F_HAS_METADATA))
		return false;

	/* Only for buffered IO */
	if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
		return false;

	/*
	 * just use poll if we can, and don't attempt if the fs doesn't
	 * support callback based unlocks
	 */
	if (io_file_can_poll(req) ||
	    !(req->file->f_op->fop_flags & FOP_BUFFER_RASYNC))
		return false;

	wait->wait.func = io_async_buf_func;
	wait->wait.private = req;
	wait->wait.flags = 0;
	INIT_LIST_HEAD(&wait->wait.entry);
	kiocb->ki_flags |= IOCB_WAITQ;
	kiocb->ki_flags &= ~IOCB_NOWAIT;
	kiocb->ki_waitq = wait;
	return true;
}

static inline int io_iter_do_read(struct io_rw *rw, struct iov_iter *iter)
{
	struct file *file = rw->kiocb.ki_filp;

	if (likely(file->f_op->read_iter))
		return file->f_op->read_iter(&rw->kiocb, iter);
	else if (file->f_op->read)
		return loop_rw_iter(READ, rw, iter);
	else
		return -EINVAL;
}

static bool need_complete_io(struct io_kiocb *req)
{
	return req->flags & REQ_F_ISREG ||
		S_ISBLK(file_inode(req->file)->i_mode);
}

static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct kiocb *kiocb = &rw->kiocb;
	struct io_ring_ctx *ctx = req->ctx;
	struct file *file = req->file;
	int ret;

	if (unlikely(!(file->f_mode & mode)))
		return -EBADF;

	if (!(req->flags & REQ_F_FIXED_FILE))
		req->flags |= io_file_get_flags(file);

	kiocb->ki_flags = file->f_iocb_flags;
	ret = kiocb_set_rw_flags(kiocb, rw->flags, rw_type);
	if (unlikely(ret))
		return ret;
	kiocb->ki_flags |= IOCB_ALLOC_CACHE;

	/*
	 * If the file is marked O_NONBLOCK, still allow retry for it if it
	 * supports async. Otherwise it's impossible to use O_NONBLOCK files
	 * reliably. If not, or if IOCB_NOWAIT is set, don't retry.
	 */
	if (kiocb->ki_flags & IOCB_NOWAIT ||
	    ((file->f_flags & O_NONBLOCK && !(req->flags & REQ_F_SUPPORT_NOWAIT))))
		req->flags |= REQ_F_NOWAIT;

	if (ctx->flags & IORING_SETUP_IOPOLL) {
		if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
			return -EOPNOTSUPP;
		kiocb->private = NULL;
		kiocb->ki_flags |= IOCB_HIPRI;
		req->iopoll_completed = 0;
		if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) {
			/* make sure every req only blocks once */
			req->flags &= ~REQ_F_IOPOLL_STATE;
			req->iopoll_start = ktime_get_ns();
		}
	} else {
		if (kiocb->ki_flags & IOCB_HIPRI)
			return -EINVAL;
	}

	if (req->flags & REQ_F_HAS_METADATA) {
		struct io_async_rw *io = req->async_data;

		if (!(file->f_mode & FMODE_HAS_METADATA))
			return -EINVAL;

		/*
		 * We have a union of meta fields with wpq used for buffered-io
		 * in io_async_rw, so fail it here.
		 */
		if (!(req->file->f_flags & O_DIRECT))
			return -EOPNOTSUPP;
		kiocb->ki_flags |= IOCB_HAS_METADATA;
		kiocb->private = &io->meta;
	}

	return 0;
}

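/*
 * Core of the read issue path: import the buffer if needed, try the IO
 * nonblocking first when issued from the submission path, and on -EAGAIN
 * either rely on poll, punt to io-wq, or arm an IOCB_WAITQ based retry for
 * buffered reads. Partial progress is accumulated in io->bytes_done so a
 * later completion reports the full amount transferred.
 */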
static int __io_read(struct io_kiocb *req, struct io_br_sel *sel,
		     unsigned int issue_flags)
{
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct io_async_rw *io = req->async_data;
	struct kiocb *kiocb = &rw->kiocb;
	ssize_t ret;
	loff_t *ppos;

	if (req->flags & REQ_F_IMPORT_BUFFER) {
		ret = io_rw_import_reg_vec(req, io, ITER_DEST, issue_flags);
		if (unlikely(ret))
			return ret;
	} else if (io_do_buffer_select(req)) {
		ret = io_import_rw_buffer(ITER_DEST, req, io, sel, issue_flags);
		if (unlikely(ret < 0))
			return ret;
	}
	ret = io_rw_init_file(req, FMODE_READ, READ);
	if (unlikely(ret))
		return ret;
	req->cqe.res = iov_iter_count(&io->iter);

	if (force_nonblock) {
		/* If the file doesn't support async, just async punt */
		if (unlikely(!io_file_supports_nowait(req, EPOLLIN)))
			return -EAGAIN;
		kiocb->ki_flags |= IOCB_NOWAIT;
	} else {
		/* Ensure we clear previously set non-block flag */
		kiocb->ki_flags &= ~IOCB_NOWAIT;
	}

	ppos = io_kiocb_update_pos(req);

	ret = rw_verify_area(READ, req->file, ppos, req->cqe.res);
	if (unlikely(ret))
		return ret;

	ret = io_iter_do_read(rw, &io->iter);

	/*
	 * Some file systems like to return -EOPNOTSUPP for an IOCB_NOWAIT
	 * issue, even though they should be returning -EAGAIN. To be safe,
	 * retry from blocking context for either.
	 */
	if (ret == -EOPNOTSUPP && force_nonblock)
		ret = -EAGAIN;

	if (ret == -EAGAIN) {
		/* If we can poll, just do that. */
		if (io_file_can_poll(req))
			return -EAGAIN;
		/* IOPOLL retry should happen for io-wq threads */
		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
			goto done;
		/* no retry on NONBLOCK nor RWF_NOWAIT */
		if (req->flags & REQ_F_NOWAIT)
			goto done;
		ret = 0;
	} else if (ret == -EIOCBQUEUED) {
		return IOU_ISSUE_SKIP_COMPLETE;
	} else if (ret == req->cqe.res || ret <= 0 || !force_nonblock ||
		   (req->flags & REQ_F_NOWAIT) || !need_complete_io(req) ||
		   (issue_flags & IO_URING_F_MULTISHOT)) {
		/* read all, failed, already did sync or don't want to retry */
		goto done;
	}

	/*
	 * Don't depend on the iter state matching what was consumed, or being
	 * untouched in case of error. Restore it and we'll advance it
	 * manually if we need to.
	 */
	iov_iter_restore(&io->iter, &io->iter_state);
	io_meta_restore(io, kiocb);

	do {
		/*
		 * We end up here because of a partial read, either from
		 * above or inside this loop. Advance the iter by the bytes
		 * that were consumed.
		 */
		iov_iter_advance(&io->iter, ret);
		if (!iov_iter_count(&io->iter))
			break;
		io->bytes_done += ret;
		iov_iter_save_state(&io->iter, &io->iter_state);

		/* if we can retry, do so with the callbacks armed */
		if (!io_rw_should_retry(req)) {
			kiocb->ki_flags &= ~IOCB_WAITQ;
			return -EAGAIN;
		}

		req->cqe.res = iov_iter_count(&io->iter);
		/*
		 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
		 * we get -EIOCBQUEUED, then we'll get a notification when the
		 * desired page gets unlocked. We can also get a partial read
		 * here, and if we do, then just retry at the new offset.
		 */
		ret = io_iter_do_read(rw, &io->iter);
		if (ret == -EIOCBQUEUED)
			return IOU_ISSUE_SKIP_COMPLETE;
		/* we got some bytes, but not all. retry. */
		kiocb->ki_flags &= ~IOCB_WAITQ;
		iov_iter_restore(&io->iter, &io->iter_state);
	} while (ret > 0);
done:
	/* it's faster to check here than to delegate to kfree */
	return ret;
}

int io_read(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_br_sel sel = { };
	int ret;

	ret = __io_read(req, &sel, issue_flags);
	if (ret >= 0)
		return kiocb_done(req, ret, &sel, issue_flags);

	if (req->flags & REQ_F_BUFFERS_COMMIT)
		io_kbuf_recycle(req, sel.buf_list, issue_flags);
	return ret;
}

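/*
 * Issue side of multishot reads: each successful chunk consumes a provided
 * buffer and posts a CQE with IORING_CQE_F_MORE set, leaving the request
 * armed for more data via poll. An error, a CQ overflow, or running out of
 * provided buffers terminates the multishot request with a final CQE.
 */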
int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct io_br_sel sel = { };
	unsigned int cflags = 0;
	int ret;

	/*
	 * Multishot MUST be used on a pollable file
	 */
	if (!io_file_can_poll(req))
		return -EBADFD;

	/* make it sync, multishot doesn't support async execution */
	rw->kiocb.ki_complete = NULL;
	ret = __io_read(req, &sel, issue_flags);

	/*
	 * If we get -EAGAIN, recycle our buffer and just let normal poll
	 * handling arm it.
	 */
	if (ret == -EAGAIN) {
		/*
		 * Reset rw->len to 0 again to avoid clamping future mshot
		 * reads, in case the buffer size varies.
		 */
		if (io_kbuf_recycle(req, sel.buf_list, issue_flags))
			rw->len = 0;
		return IOU_RETRY;
	} else if (ret <= 0) {
		io_kbuf_recycle(req, sel.buf_list, issue_flags);
		if (ret < 0)
			req_set_fail(req);
	} else if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
		cflags = io_put_kbuf(req, ret, sel.buf_list);
	} else {
		/*
		 * Any successful return value will keep the multishot read
		 * armed, if it's still set. Put our buffer and post a CQE. If
		 * we fail to post a CQE, or multishot is no longer set, then
		 * jump to the termination path. This request is then done.
		 */
		cflags = io_put_kbuf(req, ret, sel.buf_list);
		rw->len = 0; /* similarly to above, reset len to 0 */

		if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) {
			if (issue_flags & IO_URING_F_MULTISHOT)
				/*
				 * Force retry, as we might have more data to
				 * be read and otherwise it won't get retried
				 * until (if ever) another poll is triggered.
				 */
				io_poll_multishot_retry(req);

			return IOU_RETRY;
		}
	}

	/*
	 * Either an error, or we've hit overflow posting the CQE. For any
	 * multishot request, hitting overflow will terminate it.
	 */
	io_req_set_res(req, ret, cflags);
	io_req_rw_cleanup(req, issue_flags);
	return IOU_COMPLETE;
}

static bool io_kiocb_start_write(struct io_kiocb *req, struct kiocb *kiocb)
{
	struct inode *inode;
	bool ret;

	if (!(req->flags & REQ_F_ISREG))
		return true;
	if (!(kiocb->ki_flags & IOCB_NOWAIT)) {
		kiocb_start_write(kiocb);
		return true;
	}

	inode = file_inode(kiocb->ki_filp);
	ret = sb_start_write_trylock(inode->i_sb);
	if (ret)
		__sb_writers_release(inode->i_sb, SB_FREEZE_WRITE);
	return ret;
}

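/*
 * Write issue path. Nonblocking attempts are only made when the file can
 * actually support them (O_DIRECT, or FOP_BUFFER_WASYNC for buffered IO on
 * regular files); otherwise -EAGAIN is returned so the request is redone
 * from a blocking context. A short write to a regular file or block device
 * is finished off from the worker, with io->bytes_done carrying the bytes
 * already written.
 */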
int io_write(struct io_kiocb *req, unsigned int issue_flags)
{
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct io_async_rw *io = req->async_data;
	struct kiocb *kiocb = &rw->kiocb;
	ssize_t ret, ret2;
	loff_t *ppos;

	if (req->flags & REQ_F_IMPORT_BUFFER) {
		ret = io_rw_import_reg_vec(req, io, ITER_SOURCE, issue_flags);
		if (unlikely(ret))
			return ret;
	}

	ret = io_rw_init_file(req, FMODE_WRITE, WRITE);
	if (unlikely(ret))
		return ret;
	req->cqe.res = iov_iter_count(&io->iter);

	if (force_nonblock) {
		/* If the file doesn't support async, just async punt */
		if (unlikely(!io_file_supports_nowait(req, EPOLLOUT)))
			goto ret_eagain;

		/* Check if we can support NOWAIT. */
		if (!(kiocb->ki_flags & IOCB_DIRECT) &&
		    !(req->file->f_op->fop_flags & FOP_BUFFER_WASYNC) &&
		    (req->flags & REQ_F_ISREG))
			goto ret_eagain;

		kiocb->ki_flags |= IOCB_NOWAIT;
	} else {
		/* Ensure we clear previously set non-block flag */
		kiocb->ki_flags &= ~IOCB_NOWAIT;
	}

	ppos = io_kiocb_update_pos(req);

	ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res);
	if (unlikely(ret))
		return ret;

	if (unlikely(!io_kiocb_start_write(req, kiocb)))
		return -EAGAIN;
	kiocb->ki_flags |= IOCB_WRITE;

	if (likely(req->file->f_op->write_iter))
		ret2 = req->file->f_op->write_iter(kiocb, &io->iter);
	else if (req->file->f_op->write)
		ret2 = loop_rw_iter(WRITE, rw, &io->iter);
	else
		ret2 = -EINVAL;

	/*
	 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
	 * retry them without IOCB_NOWAIT.
	 */
	if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
		ret2 = -EAGAIN;
	/* no retry on NONBLOCK nor RWF_NOWAIT */
	if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
		goto done;
	if (!force_nonblock || ret2 != -EAGAIN) {
		/* IOPOLL retry should happen for io-wq threads */
		if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
			goto ret_eagain;

		if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) {
			trace_io_uring_short_write(req->ctx, kiocb->ki_pos - ret2,
						   req->cqe.res, ret2);

			/* This is a partial write. The file pos has already been
			 * updated, setup the async struct to complete the request
			 * in the worker. Also update bytes_done to account for
			 * the bytes already written.
			 */
			iov_iter_save_state(&io->iter, &io->iter_state);
			io->bytes_done += ret2;

			if (kiocb->ki_flags & IOCB_WRITE)
				io_req_end_write(req);
			return -EAGAIN;
		}
done:
		return kiocb_done(req, ret2, NULL, issue_flags);
	} else {
ret_eagain:
		iov_iter_restore(&io->iter, &io->iter_state);
		io_meta_restore(io, kiocb);
		if (kiocb->ki_flags & IOCB_WRITE)
			io_req_end_write(req);
		return -EAGAIN;
	}
}

int io_read_fixed(struct io_kiocb *req, unsigned int issue_flags)
{
	int ret;

	ret = io_init_rw_fixed(req, issue_flags, ITER_DEST);
	if (unlikely(ret))
		return ret;

	return io_read(req, issue_flags);
}

int io_write_fixed(struct io_kiocb *req, unsigned int issue_flags)
{
	int ret;

	ret = io_init_rw_fixed(req, issue_flags, ITER_SOURCE);
	if (unlikely(ret))
		return ret;

	return io_write(req, issue_flags);
}

void io_rw_fail(struct io_kiocb *req)
{
	int res;

	res = io_fixup_rw_res(req, req->cqe.res);
	io_req_set_res(req, res, req->cqe.flags);
}

static int io_uring_classic_poll(struct io_kiocb *req, struct io_comp_batch *iob,
				 unsigned int poll_flags)
{
	struct file *file = req->file;

	if (req->opcode == IORING_OP_URING_CMD) {
		struct io_uring_cmd *ioucmd;

		ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
		return file->f_op->uring_cmd_iopoll(ioucmd, iob, poll_flags);
	} else {
		struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);

		return file->f_op->iopoll(&rw->kiocb, iob, poll_flags);
	}
}

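/*
 * Hybrid iopoll: before spinning on the device, sleep for roughly half of
 * the best completion time seen so far (ctx->hybrid_poll_time), so that CPU
 * isn't burned polling for a completion that is known to be some time away.
 * The sleep is only taken once per request (REQ_F_IOPOLL_STATE).
 */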
static u64 io_hybrid_iopoll_delay(struct io_ring_ctx *ctx, struct io_kiocb *req)
{
	struct hrtimer_sleeper timer;
	enum hrtimer_mode mode;
	ktime_t kt;
	u64 sleep_time;

	if (req->flags & REQ_F_IOPOLL_STATE)
		return 0;

	if (ctx->hybrid_poll_time == LLONG_MAX)
		return 0;

	/* Use half of the tracked completion time for the sleep */
	sleep_time = ctx->hybrid_poll_time / 2;

	kt = ktime_set(0, sleep_time);
	req->flags |= REQ_F_IOPOLL_STATE;

	mode = HRTIMER_MODE_REL;
	hrtimer_setup_sleeper_on_stack(&timer, CLOCK_MONOTONIC, mode);
	hrtimer_set_expires(&timer.timer, kt);
	set_current_state(TASK_INTERRUPTIBLE);
	hrtimer_sleeper_start_expires(&timer, mode);

	if (timer.task)
		io_schedule();

	hrtimer_cancel(&timer.timer);
	__set_current_state(TASK_RUNNING);
	destroy_hrtimer_on_stack(&timer.timer);
	return sleep_time;
}

static int io_uring_hybrid_poll(struct io_kiocb *req,
				struct io_comp_batch *iob, unsigned int poll_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	u64 runtime, sleep_time;
	int ret;

	sleep_time = io_hybrid_iopoll_delay(ctx, req);
	ret = io_uring_classic_poll(req, iob, poll_flags);
	runtime = ktime_get_ns() - req->iopoll_start - sleep_time;

	/*
	 * Use minimum sleep time if we're polling devices with different
	 * latencies. We could get more completions from the faster ones.
	 */
	if (ctx->hybrid_poll_time > runtime)
		ctx->hybrid_poll_time = runtime;

	return ret;
}

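/*
 * Reap completions for IOPOLL requests: actively poll each request on
 * ->iopoll_list, then flush the completed prefix of the list as CQEs.
 * Returns the number of events found, or < 0 on error.
 */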
int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
{
	struct io_wq_work_node *pos, *start, *prev;
	unsigned int poll_flags = 0;
	DEFINE_IO_COMP_BATCH(iob);
	int nr_events = 0;

	/*
	 * Only spin for completions if we don't have multiple devices hanging
	 * off our complete list.
	 */
	if (ctx->poll_multi_queue || force_nonspin)
		poll_flags |= BLK_POLL_ONESHOT;

	wq_list_for_each(pos, start, &ctx->iopoll_list) {
		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
		int ret;

		/*
		 * Move completed and retryable entries to our local lists.
		 * If we find a request that requires polling, break out
		 * and complete those lists first, if we have entries there.
		 */
		if (READ_ONCE(req->iopoll_completed))
			break;

		if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL)
			ret = io_uring_hybrid_poll(req, &iob, poll_flags);
		else
			ret = io_uring_classic_poll(req, &iob, poll_flags);

		if (unlikely(ret < 0))
			return ret;
		else if (ret)
			poll_flags |= BLK_POLL_ONESHOT;

		/* iopoll may have completed current req */
		if (!rq_list_empty(&iob.req_list) ||
		    READ_ONCE(req->iopoll_completed))
			break;
	}

	if (!rq_list_empty(&iob.req_list))
		iob.complete(&iob);
	else if (!pos)
		return 0;

	prev = start;
	wq_list_for_each_resume(pos, prev) {
		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);

		/* order with io_complete_rw_iopoll(), e.g. ->result updates */
		if (!smp_load_acquire(&req->iopoll_completed))
			break;
		nr_events++;
		req->cqe.flags = io_put_kbuf(req, req->cqe.res, NULL);
		if (req->opcode != IORING_OP_URING_CMD)
			io_req_rw_cleanup(req, 0);
	}
	if (unlikely(!nr_events))
		return 0;

	pos = start ? start->next : ctx->iopoll_list.first;
	wq_list_cut(&ctx->iopoll_list, prev, start);

	if (WARN_ON_ONCE(!wq_list_empty(&ctx->submit_state.compl_reqs)))
		return 0;
	ctx->submit_state.compl_reqs.first = pos;
	__io_submit_flush_completions(ctx);
	return nr_events;
}

void io_rw_cache_free(const void *entry)
{
	struct io_async_rw *rw = (struct io_async_rw *) entry;

	io_vec_free(&rw->vec);
	kfree(rw);
}