Lines Matching: sig - dir - cmd
1 // SPDX-License-Identifier: GPL-2.0
14 * through a control-dependency in io_get_cqe (smp_store_release to
39 * Copyright (C) 2018-2019 Jens Axboe
40 * Copyright (c) 2018-2019 Christoph Hellwig
69 #include <linux/io_uring/cmd.h>
80 #include "io-wq.h"
137 * so that tests against ->cq_wait_nr would fail and skip wake_up().
139 #define IO_CQ_WAKE_INIT (-1U)
140 /* Forced wake up if there is a waiter regardless of ->cq_wait_nr */
157 static int __read_mostly sysctl_io_uring_group = -1;
182 req->ctx = IO_URING_PTR_POISON;
183 req->tctx = IO_URING_PTR_POISON;
184 req->file = IO_URING_PTR_POISON;
185 req->creds = IO_URING_PTR_POISON;
186 req->io_task_work.func = IO_URING_PTR_POISON;
187 req->apoll = IO_URING_PTR_POISON;
193 req->async_data = IO_URING_PTR_POISON;
194 req->kbuf = IO_URING_PTR_POISON;
195 req->comp_list.next = IO_URING_PTR_POISON;
196 req->file_node = IO_URING_PTR_POISON;
197 req->link = IO_URING_PTR_POISON;
202 return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
207 return READ_ONCE(ctx->rings->cq.tail) - READ_ONCE(ctx->rings->cq.head);
215 if (req->flags & REQ_F_INFLIGHT)
230 if (tctx && head->tctx != tctx)
235 if (head->flags & REQ_F_LINK_TIMEOUT) {
236 struct io_ring_ctx *ctx = head->ctx;
239 raw_spin_lock_irq(&ctx->timeout_lock);
241 raw_spin_unlock_irq(&ctx->timeout_lock);
258 wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
265 complete(&ctx->ref_comp);
272 struct llist_node *node = llist_del_all(&ctx->fallback_llist);
276 percpu_ref_get(&ctx->refs);
277 mutex_lock(&ctx->uring_lock);
279 req->io_task_work.func(req, ts);
281 mutex_unlock(&ctx->uring_lock);
282 percpu_ref_put(&ctx->refs);
292 table->hbs = kvmalloc_array(hash_buckets, sizeof(table->hbs[0]),
294 if (table->hbs)
297 return -ENOMEM;
298 bits--;
301 table->hash_bits = bits;
303 INIT_HLIST_HEAD(&table->hbs[i].list);
309 io_alloc_cache_free(&ctx->apoll_cache, kfree);
310 io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
311 io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
312 io_alloc_cache_free(&ctx->cmd_cache, io_cmd_cache_free);
327 xa_init(&ctx->io_bl_xa);
334 hash_bits = ilog2(p->cq_entries) - 5;
336 if (io_alloc_hash_table(&ctx->cancel_table, hash_bits))
338 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
342 ctx->flags = p->flags;
343 ctx->hybrid_poll_time = LLONG_MAX;
344 atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
345 init_waitqueue_head(&ctx->sqo_sq_wait);
346 INIT_LIST_HEAD(&ctx->sqd_list);
347 INIT_LIST_HEAD(&ctx->cq_overflow_list);
348 ret = io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX,
350 ret |= io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX,
353 ret |= io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX,
356 ret |= io_alloc_cache_init(&ctx->cmd_cache, IO_ALLOC_CACHE_MAX,
363 init_completion(&ctx->ref_comp);
364 xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
365 mutex_init(&ctx->uring_lock);
366 init_waitqueue_head(&ctx->cq_wait);
367 init_waitqueue_head(&ctx->poll_wq);
368 spin_lock_init(&ctx->completion_lock);
369 raw_spin_lock_init(&ctx->timeout_lock);
370 INIT_WQ_LIST(&ctx->iopoll_list);
371 INIT_LIST_HEAD(&ctx->defer_list);
372 INIT_LIST_HEAD(&ctx->timeout_list);
373 INIT_LIST_HEAD(&ctx->ltimeout_list);
374 init_llist_head(&ctx->work_llist);
375 INIT_LIST_HEAD(&ctx->tctx_list);
376 ctx->submit_state.free_list.next = NULL;
377 INIT_HLIST_HEAD(&ctx->waitid_list);
378 xa_init_flags(&ctx->zcrx_ctxs, XA_FLAGS_ALLOC);
380 INIT_HLIST_HEAD(&ctx->futex_list);
382 INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
383 INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
384 INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd);
386 mutex_init(&ctx->mmap_lock);
391 percpu_ref_exit(&ctx->refs);
394 kvfree(ctx->cancel_table.hbs);
395 xa_destroy(&ctx->io_bl_xa);
402 if (unlikely(req->flags & REQ_F_BUFFER_SELECTED))
405 if (req->flags & REQ_F_NEED_CLEANUP) {
406 const struct io_cold_def *def = &io_cold_defs[req->opcode];
408 if (def->cleanup)
409 def->cleanup(req);
411 if (req->flags & REQ_F_INFLIGHT)
412 atomic_dec(&req->tctx->inflight_tracked);
413 if (req->flags & REQ_F_CREDS)
414 put_cred(req->creds);
415 if (req->flags & REQ_F_ASYNC_DATA) {
416 kfree(req->async_data);
417 req->async_data = NULL;
419 req->flags &= ~IO_REQ_CLEAN_FLAGS;
425 * relies on ->mm being alive for the duration of the request.
429 if (!(req->flags & REQ_F_INFLIGHT)) {
430 req->flags |= REQ_F_INFLIGHT;
431 atomic_inc(&req->tctx->inflight_tracked);
437 if (WARN_ON_ONCE(!req->link))
440 req->flags &= ~REQ_F_ARM_LTIMEOUT;
441 req->flags |= REQ_F_LINK_TIMEOUT;
445 __io_req_set_refcount(req->link, 2);
446 return req->link;
451 const struct io_issue_def *def = &io_issue_defs[req->opcode];
452 struct io_ring_ctx *ctx = req->ctx;
454 if (!(req->flags & REQ_F_CREDS)) {
455 req->flags |= REQ_F_CREDS;
456 req->creds = get_current_cred();
459 req->work.list.next = NULL;
460 atomic_set(&req->work.flags, 0);
461 if (req->flags & REQ_F_FORCE_ASYNC)
462 atomic_or(IO_WQ_WORK_CONCURRENT, &req->work.flags);
464 if (req->file && !(req->flags & REQ_F_FIXED_FILE))
465 req->flags |= io_file_get_flags(req->file);
467 if (req->file && (req->flags & REQ_F_ISREG)) {
468 bool should_hash = def->hash_reg_file;
471 if (should_hash && (req->file->f_flags & O_DIRECT) &&
472 (req->file->f_op->fop_flags & FOP_DIO_PARALLEL_WRITE))
474 if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL))
475 io_wq_hash_work(&req->work, file_inode(req->file));
476 } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
477 if (def->unbound_nonreg_file)
478 atomic_or(IO_WQ_WORK_UNBOUND, &req->work.flags);
486 if (req->flags & REQ_F_LINK_TIMEOUT) {
487 struct io_ring_ctx *ctx = req->ctx;
489 raw_spin_lock_irq(&ctx->timeout_lock);
492 raw_spin_unlock_irq(&ctx->timeout_lock);
501 struct io_uring_task *tctx = req->tctx;
505 if ((current->flags & PF_KTHREAD) || !tctx->io_wq) {
506 io_req_task_queue_fail(req, -ECANCELED);
510 /* init ->work of the whole link before punting */
516 * canceled. That will make io-wq go through the usual work cancel
520 if (WARN_ON_ONCE(!same_thread_group(tctx->task, current)))
521 atomic_or(IO_WQ_WORK_CANCEL, &req->work.flags);
523 trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work));
524 io_wq_enqueue(tctx->io_wq, &req->work);
534 req->io_task_work.func = io_req_queue_iowq_tw;
552 lockdep_assert_held(&ctx->uring_lock);
555 while (!list_empty(&ctx->defer_list)) {
556 struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
559 drain_seen |= de->req->flags & REQ_F_IO_DRAIN;
560 if ((drain_seen || first) && ctx->nr_req_allocated != ctx->nr_drained)
563 list_del_init(&de->list);
564 ctx->nr_drained -= io_linked_nr(de->req);
565 io_req_task_queue(de->req);
573 if (ctx->poll_activated)
575 if (ctx->off_timeout_used)
577 if (ctx->has_evfd)
583 if (!ctx->lockless_cq)
584 spin_lock(&ctx->completion_lock);
588 __acquires(ctx->completion_lock)
590 spin_lock(&ctx->completion_lock);
596 if (!ctx->task_complete) {
597 if (!ctx->lockless_cq)
598 spin_unlock(&ctx->completion_lock);
600 if (!ctx->syscall_iopoll)
607 __releases(ctx->completion_lock)
610 spin_unlock(&ctx->completion_lock);
617 lockdep_assert_held(&ctx->uring_lock);
620 if (!dying && __io_cqring_events(ctx) == ctx->cq_entries)
624 while (!list_empty(&ctx->cq_overflow_list)) {
630 ocqe = list_first_entry(&ctx->cq_overflow_list,
632 if (ocqe->cqe.flags & IORING_CQE_F_32 ||
633 ctx->flags & IORING_SETUP_CQE32) {
641 memcpy(cqe, &ocqe->cqe, cqe_size);
643 list_del(&ocqe->list);
650 * Ideally we'd have a non-posting unlock for this, but hard
651 * to care for a non-real case.
654 ctx->cqe_sentinel = ctx->cqe_cached;
656 mutex_unlock(&ctx->uring_lock);
658 mutex_lock(&ctx->uring_lock);
663 if (list_empty(&ctx->cq_overflow_list)) {
664 clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
665 atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
672 if (ctx->rings)
678 mutex_lock(&ctx->uring_lock);
680 mutex_unlock(&ctx->uring_lock);
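/*
 * Userspace view of the overflow flush above: with IORING_FEAT_NODROP the
 * kernel parks completions on ctx->cq_overflow_list when the CQ ring is
 * full and raises IORING_SQ_CQ_OVERFLOW; they are copied back on a later
 * enter with IORING_ENTER_GETEVENTS. A minimal liburing sketch, assuming a
 * liburing recent enough to provide io_uring_cq_has_overflow() and
 * io_uring_get_events(); ring sizes and the function name are illustrative.
 */
#include <liburing.h>

static int overflow_demo(void)
{
	struct io_uring ring;
	struct io_uring_cqe *cqe;
	int i, batch, ret, queued = 0;

	/* 4 SQ entries -> 8 CQ entries, so 16 unreaped nops must overflow */
	ret = io_uring_queue_init(4, &ring, 0);
	if (ret < 0)
		return ret;

	for (batch = 0; batch < 4; batch++) {
		for (i = 0; i < 4; i++)
			io_uring_prep_nop(io_uring_get_sqe(&ring));
		ret = io_uring_submit(&ring);
		if (ret < 0)
			break;	/* some kernels report -EBUSY while overflowed */
		queued += ret;
	}

	while (queued) {
		if (io_uring_peek_cqe(&ring, &cqe)) {
			/* ring empty, but CQEs may be parked in the kernel */
			if (io_uring_cq_has_overflow(&ring))
				io_uring_get_events(&ring);
			if (io_uring_wait_cqe(&ring, &cqe))
				break;
		}
		io_uring_cqe_seen(&ring, cqe);
		queued--;
	}
	io_uring_queue_exit(&ring);
	return 0;
}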
686 struct io_uring_task *tctx = req->tctx;
688 if (likely(tctx->task == current)) {
689 tctx->cached_refs++;
691 percpu_counter_sub(&tctx->inflight, 1);
692 if (unlikely(atomic_read(&tctx->in_cancel)))
693 wake_up(&tctx->wait);
694 put_task_struct(tctx->task);
700 unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
702 percpu_counter_add(&tctx->inflight, refill);
703 refcount_add(refill, &current->usage);
704 tctx->cached_refs += refill;
709 struct io_uring_task *tctx = task->io_uring;
710 unsigned int refs = tctx->cached_refs;
713 tctx->cached_refs = 0;
714 percpu_counter_sub(&tctx->inflight, refs);
722 lockdep_assert_held(&ctx->completion_lock);
725 struct io_rings *r = ctx->rings;
732 WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
733 set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq);
736 if (list_empty(&ctx->cq_overflow_list)) {
737 set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
738 atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
741 list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
753 if (cqe->flags & IORING_CQE_F_32 || ctx->flags & IORING_SETUP_CQE32) {
759 trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe);
761 ocqe->cqe.user_data = cqe->user_data;
762 ocqe->cqe.res = cqe->res;
763 ocqe->cqe.flags = cqe->flags;
765 ocqe->cqe.big_cqe[0] = big_cqe->extra1;
766 ocqe->cqe.big_cqe[1] = big_cqe->extra2;
770 big_cqe->extra1 = big_cqe->extra2 = 0;
780 if (__io_cqring_events(ctx) < ctx->cq_entries) {
781 struct io_uring_cqe *cqe = &ctx->rings->cqes[off];
783 cqe->user_data = 0;
784 cqe->res = 0;
785 cqe->flags = IORING_CQE_F_SKIP;
786 ctx->cached_cq_tail++;
799 struct io_rings *rings = ctx->rings;
800 unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
808 if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)))
815 if (cqe32 && off + 1 == ctx->cq_entries) {
822 queued = min(__io_cqring_events(ctx), ctx->cq_entries);
823 free = ctx->cq_entries - queued;
825 len = min(free, ctx->cq_entries - off);
829 if (ctx->flags & IORING_SETUP_CQE32) {
834 ctx->cqe_cached = &rings->cqes[off];
835 ctx->cqe_sentinel = ctx->cqe_cached + len;
844 if (WARN_ON_ONCE(!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))))
861 WRITE_ONCE(cqe->user_data, user_data);
862 WRITE_ONCE(cqe->res, res);
863 WRITE_ONCE(cqe->flags, cflags);
866 WRITE_ONCE(cqe->big_cqe[0], 0);
867 WRITE_ONCE(cqe->big_cqe[1], 0);
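/*
 * The WRITE_ONCE() stores above publish the CQE fields before the CQ tail
 * is bumped; the consumer reads between head and tail and then advances the
 * head. A minimal reaping sketch with liburing (the helper name reap_all()
 * is illustrative): io_uring_cq_advance() performs the single release store
 * on cq.head that pairs with the kernel's tail update.
 */
#include <liburing.h>
#include <stdio.h>

static unsigned reap_all(struct io_uring *ring)
{
	struct io_uring_cqe *cqe;
	unsigned head, seen = 0;

	io_uring_for_each_cqe(ring, head, cqe) {
		/* user_data/res/flags are stable once the CQE is visible here */
		if (cqe->res < 0)
			fprintf(stderr, "request %llu failed: %d\n",
				(unsigned long long)cqe->user_data, cqe->res);
		seen++;
	}
	io_uring_cq_advance(ring, seen);	/* one head update for the batch */
	return seen;
}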
887 spin_lock(&ctx->completion_lock);
889 spin_unlock(&ctx->completion_lock);
919 * and obviously with ctx->uring_lock held (tw always has that).
923 lockdep_assert_held(&ctx->uring_lock);
924 lockdep_assert(ctx->lockless_cq);
931 ctx->submit_state.cq_flush = true;
940 struct io_ring_ctx *ctx = req->ctx;
948 if (!wq_list_empty(&ctx->submit_state.compl_reqs))
952 lockdep_assert_held(&ctx->uring_lock);
954 if (!ctx->lockless_cq) {
955 spin_lock(&ctx->completion_lock);
956 posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags);
957 spin_unlock(&ctx->completion_lock);
959 posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags);
962 ctx->submit_state.cq_flush = true;
972 struct io_ring_ctx *ctx = req->ctx;
976 lockdep_assert_held(&ctx->uring_lock);
978 cqe[0].user_data = req->cqe.user_data;
979 if (!ctx->lockless_cq) {
980 spin_lock(&ctx->completion_lock);
982 spin_unlock(&ctx->completion_lock);
987 ctx->submit_state.cq_flush = true;
993 struct io_ring_ctx *ctx = req->ctx;
997 * All execution paths but io-wq use the deferred completions by
1007 if (ctx->lockless_cq || (req->flags & REQ_F_REISSUE)) {
1009 req->io_task_work.func = io_req_task_complete;
1015 if (!(req->flags & REQ_F_CQE_SKIP))
1024 * io-wq only, which holds a reference, so it cannot be the last put.
1030 __must_hold(&ctx->uring_lock)
1032 const struct io_cold_def *def = &io_cold_defs[req->opcode];
1034 lockdep_assert_held(&req->ctx->uring_lock);
1038 if (def->fail)
1039 def->fail(req);
1046 * Because of that, io_alloc_req() should be called only under ->uring_lock
1050 __must_hold(&ctx->uring_lock)
1059 * Bulk alloc is all-or-nothing. If we fail to get a batch,
1069 percpu_ref_get_many(&ctx->refs, ret);
1070 ctx->nr_req_allocated += ret;
1072 while (ret--) {
1083 req->flags &= ~REQ_F_REFCOUNT;
1085 req->flags |= REQ_F_CQE_SKIP;
1086 req->io_task_work.func = io_req_task_complete;
1092 struct io_ring_ctx *ctx = req->ctx;
1094 spin_lock(&ctx->completion_lock);
1096 spin_unlock(&ctx->completion_lock);
1109 if (unlikely(req->flags & IO_DISARM_MASK))
1111 nxt = req->link;
1112 req->link = NULL;
1120 if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
1121 atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
1124 mutex_unlock(&ctx->uring_lock);
1125 percpu_ref_put(&ctx->refs);
1141 struct llist_node *next = node->next;
1145 if (req->ctx != ctx) {
1147 ctx = req->ctx;
1148 mutex_lock(&ctx->uring_lock);
1149 percpu_ref_get(&ctx->refs);
1151 INDIRECT_CALL_2(req->io_task_work.func,
1174 node = node->next;
1175 if (last_ctx != req->ctx) {
1178 flush_delayed_work(&last_ctx->fallback_work);
1179 percpu_ref_put(&last_ctx->refs);
1181 last_ctx = req->ctx;
1182 percpu_ref_get(&last_ctx->refs);
1184 if (llist_add(&req->io_task_work.node, &last_ctx->fallback_llist))
1185 schedule_delayed_work(&last_ctx->fallback_work, 1);
1190 flush_delayed_work(&last_ctx->fallback_work);
1191 percpu_ref_put(&last_ctx->refs);
1197 struct llist_node *node = llist_del_all(&tctx->task_list);
1208 if (unlikely(current->flags & PF_EXITING)) {
1213 node = llist_del_all(&tctx->task_list);
1219 /* relaxed read is enough as only the task itself sets ->in_cancel */
1220 if (unlikely(atomic_read(&tctx->in_cancel)))
1241 struct io_ring_ctx *ctx = req->ctx;
1250 * they can even be queued lazily, fall back to non-lazy.
1252 if (req->flags & IO_REQ_LINK_FLAGS)
1257 head = READ_ONCE(ctx->work_llist.first);
1268 nr_tw_prev = READ_ONCE(first_req->nr_tw);
1279 req->nr_tw = nr_tw;
1280 req->io_task_work.node.next = head;
1281 } while (!try_cmpxchg(&ctx->work_llist.first, &head,
1282 &req->io_task_work.node));
1287 * to ensure that either we see updated ->cq_wait_nr, or waiters
1293 if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
1294 atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
1295 if (ctx->has_evfd)
1299 nr_wait = atomic_read(&ctx->cq_wait_nr);
1306 wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE);
1311 struct io_uring_task *tctx = req->tctx;
1312 struct io_ring_ctx *ctx = req->ctx;
1315 if (!llist_add(&req->io_task_work.node, &tctx->task_list))
1318 if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
1319 atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
1322 if (ctx->flags & IORING_SETUP_SQPOLL) {
1323 __set_notify_signal(tctx->task);
1327 if (likely(!task_work_add(tctx->task, &tctx->task_work, ctx->notify_method)))
1335 if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN)
1343 if (WARN_ON_ONCE(!(req->ctx->flags & IORING_SETUP_DEFER_TASKRUN)))
1350 struct llist_node *node = llist_del_all(&ctx->work_llist);
1353 node = llist_del_all(&ctx->retry_llist);
1364 if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
1365 atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
1376 struct llist_node *next = (*node)->next;
1379 INDIRECT_CALL_2(req->io_task_work.func,
1397 if (WARN_ON_ONCE(ctx->submitter_task != current))
1398 return -EEXIST;
1399 if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
1400 atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
1402 min_events -= ret;
1403 ret = __io_run_local_work_loop(&ctx->retry_llist.first, tw, max_events);
1404 if (ctx->retry_llist.first)
1411 node = llist_reverse_order(llist_del_all(&ctx->work_llist));
1412 ret += __io_run_local_work_loop(&node, tw, max_events - ret);
1413 ctx->retry_llist.first = node;
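/*
 * The local work list handling above (ctx->work_llist / retry_llist) backs
 * IORING_SETUP_DEFER_TASKRUN: completion work is not pushed to the task via
 * task_work/IPI but run when the task itself enters the ring. A minimal
 * sketch; DEFER_TASKRUN requires IORING_SETUP_SINGLE_ISSUER and a roughly
 * 6.1+ kernel, and the helper name is illustrative.
 */
#include <liburing.h>

static int defer_taskrun_ring(struct io_uring *ring)
{
	int ret = io_uring_queue_init(64, ring,
				      IORING_SETUP_SINGLE_ISSUER |
				      IORING_SETUP_DEFER_TASKRUN);
	if (ret < 0)
		return ret;	/* e.g. -EINVAL on kernels without support */

	/*
	 * All submits and waits must now come from this task. CQEs are only
	 * posted while it is inside io_uring_submit_and_wait() (or another
	 * enter with IORING_ENTER_GETEVENTS), which is when the kernel runs
	 * the work queued on ctx->work_llist.
	 */
	return 0;
}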
1444 mutex_lock(&ctx->uring_lock);
1446 mutex_unlock(&ctx->uring_lock);
1452 io_tw_lock(req->ctx, tw);
1453 io_req_defer_failed(req, req->cqe.res);
1458 struct io_ring_ctx *ctx = req->ctx;
1462 io_req_defer_failed(req, -EFAULT);
1463 else if (req->flags & REQ_F_FORCE_ASYNC)
1472 req->io_task_work.func = io_req_task_cancel;
1478 req->io_task_work.func = io_req_task_submit;
1492 if (req->file_node) {
1493 io_put_rsrc_node(req->ctx, req->file_node);
1494 req->file_node = NULL;
1496 if (req->flags & REQ_F_BUF_NODE)
1497 io_put_rsrc_node(req->ctx, req->buf_node);
1502 __must_hold(&ctx->uring_lock)
1508 if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) {
1509 if (req->flags & REQ_F_REISSUE) {
1510 node = req->comp_list.next;
1511 req->flags &= ~REQ_F_REISSUE;
1515 if (req->flags & REQ_F_REFCOUNT) {
1516 node = req->comp_list.next;
1520 if ((req->flags & REQ_F_POLLED) && req->apoll) {
1521 struct async_poll *apoll = req->apoll;
1523 if (apoll->double_poll)
1524 kfree(apoll->double_poll);
1525 io_cache_free(&ctx->apoll_cache, apoll);
1526 req->flags &= ~REQ_F_POLLED;
1528 if (req->flags & IO_REQ_LINK_FLAGS)
1530 if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
1537 node = req->comp_list.next;
1543 __must_hold(&ctx->uring_lock)
1545 struct io_submit_state *state = &ctx->submit_state;
1549 __wq_list_for_each(node, &state->compl_reqs) {
1555 * will go through the io-wq retry machinery and post one
1558 if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) &&
1560 if (ctx->lockless_cq)
1561 io_cqe_overflow(ctx, &req->cqe, &req->big_cqe);
1563 io_cqe_overflow_locked(ctx, &req->cqe, &req->big_cqe);
1568 if (!wq_list_empty(&state->compl_reqs)) {
1569 io_free_batch_list(ctx, state->compl_reqs.first);
1570 INIT_WQ_LIST(&state->compl_reqs);
1573 if (unlikely(ctx->drain_active))
1576 ctx->submit_state.cq_flush = false;
1592 if (!(ctx->flags & IORING_SETUP_IOPOLL))
1595 mutex_lock(&ctx->uring_lock);
1596 while (!wq_list_empty(&ctx->iopoll_list)) {
1601 * Ensure we allow local-to-the-cpu processing to take place,
1606 mutex_unlock(&ctx->uring_lock);
1608 mutex_lock(&ctx->uring_lock);
1611 mutex_unlock(&ctx->uring_lock);
1613 if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
1622 min_events = min(min_events, ctx->cq_entries);
1624 lockdep_assert_held(&ctx->uring_lock);
1627 return -EEXIST;
1629 check_cq = READ_ONCE(ctx->check_cq);
1638 return -EBADR;
1661 if (wq_list_empty(&ctx->iopoll_list) ||
1663 u32 tail = ctx->cached_cq_tail;
1668 wq_list_empty(&ctx->iopoll_list)) {
1669 mutex_unlock(&ctx->uring_lock);
1671 mutex_lock(&ctx->uring_lock);
1674 if (tail != ctx->cached_cq_tail ||
1675 wq_list_empty(&ctx->iopoll_list))
1683 return -EINTR;
1706 struct io_ring_ctx *ctx = req->ctx;
1711 mutex_lock(&ctx->uring_lock);
1718 if (wq_list_empty(&ctx->iopoll_list)) {
1719 ctx->poll_multi_queue = false;
1720 } else if (!ctx->poll_multi_queue) {
1723 list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
1725 if (list_req->file != req->file)
1726 ctx->poll_multi_queue = true;
1733 if (READ_ONCE(req->iopoll_completed))
1734 wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
1736 wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);
1745 if ((ctx->flags & IORING_SETUP_SQPOLL) &&
1746 wq_has_sleeper(&ctx->sq_data->wait))
1747 wake_up(&ctx->sq_data->wait);
1749 mutex_unlock(&ctx->uring_lock);
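/*
 * The iopoll_list handling above backs IORING_SETUP_IOPOLL, where
 * completions are reaped by polling the device rather than waiting for an
 * IRQ. A minimal sketch: IOPOLL needs O_DIRECT on a file/queue that
 * supports polled I/O, otherwise the read completes with an error. The
 * path, sizes and function name are illustrative.
 */
#define _GNU_SOURCE
#include <liburing.h>
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

static int iopoll_read(const char *path)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	void *buf = NULL;
	int fd, ret = -1;

	if (io_uring_queue_init(8, &ring, IORING_SETUP_IOPOLL) < 0)
		return -1;
	fd = open(path, O_RDONLY | O_DIRECT);
	if (fd < 0)
		goto out;
	if (posix_memalign(&buf, 4096, 4096))
		goto out_close;

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read(sqe, fd, buf, 4096, 0);
	io_uring_submit(&ring);

	/* busy-polls in io_iopoll_check()/io_do_iopoll() until the I/O is done */
	ret = io_uring_wait_cqe(&ring, &cqe);
	if (!ret) {
		ret = cqe->res;		/* bytes read, or -errno */
		io_uring_cqe_seen(&ring, cqe);
	}
	free(buf);
out_close:
	close(fd);
out:
	io_uring_queue_exit(&ring);
	return ret;
}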
1759 if (S_ISREG(file_inode(file)->i_mode))
1761 if ((file->f_flags & O_NONBLOCK) || (file->f_mode & FMODE_NOWAIT))
1767 __must_hold(&ctx->uring_lock)
1769 struct io_ring_ctx *ctx = req->ctx;
1770 bool drain = req->flags & IOSQE_IO_DRAIN;
1775 io_req_defer_failed(req, -ENOMEM);
1781 de->req = req;
1783 ctx->nr_drained += io_linked_nr(req);
1784 list_add_tail(&de->list, &ctx->defer_list);
1786 if (!drain && list_empty(&ctx->defer_list))
1787 ctx->drain_active = false;
1793 if (req->file || !def->needs_file)
1796 if (req->flags & REQ_F_FIXED_FILE)
1797 req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags);
1799 req->file = io_file_get_normal(req, req->cqe.fd);
1801 return !!req->file;
1814 if (unlikely(req->flags & REQ_ISSUE_SLOW_FLAGS)) {
1815 if ((req->flags & REQ_F_CREDS) && req->creds != current_cred())
1816 creds = override_creds(req->creds);
1817 if (req->flags & REQ_F_ARM_LTIMEOUT)
1821 if (!def->audit_skip)
1822 audit_uring_entry(req->opcode);
1824 ret = def->issue(req, issue_flags);
1826 if (!def->audit_skip)
1841 const struct io_issue_def *def = &io_issue_defs[req->opcode];
1845 return -EBADF;
1862 if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue)
1875 io_tw_lock(req->ctx, tw);
1877 WARN_ON_ONCE(!req->file);
1878 if (WARN_ON_ONCE(req->ctx->flags & IORING_SETUP_IOPOLL))
1879 return -EFAULT;
1881 ret = __io_issue_sqe(req, issue_flags, &io_issue_defs[req->opcode]);
1893 if (req->flags & IO_REQ_LINK_FLAGS)
1897 return nxt ? &nxt->work : NULL;
1903 const struct io_issue_def *def = &io_issue_defs[req->opcode];
1906 int ret = 0, err = -ECANCELED;
1908 /* one will be dropped by io_wq_free_work() after returning to io-wq */
1909 if (!(req->flags & REQ_F_REFCOUNT))
1914 /* either cancelled or io-wq is dying, so don't touch tctx->iowq */
1915 if (atomic_read(&work->flags) & IO_WQ_WORK_CANCEL) {
1921 err = -EBADF;
1922 atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
1931 * Don't allow any multishot execution from io-wq. It's more restrictive
1934 if (req->flags & (REQ_F_MULTISHOT|REQ_F_APOLL_MULTISHOT)) {
1935 err = -EBADFD;
1938 if (req->file->f_flags & O_NONBLOCK ||
1939 req->file->f_mode & FMODE_NOWAIT) {
1940 err = -ECANCELED;
1945 req->flags &= ~(REQ_F_APOLL_MULTISHOT|REQ_F_MULTISHOT);
1949 if (req->flags & REQ_F_FORCE_ASYNC) {
1950 bool opcode_poll = def->pollin || def->pollout;
1960 if (ret != -EAGAIN)
1965 * poll. -EAGAIN is final for that case.
1967 if (req->flags & REQ_F_NOWAIT)
1976 if (!(req->ctx->flags & IORING_SETUP_IOPOLL))
1999 struct io_ring_ctx *ctx = req->ctx;
2004 node = io_rsrc_node_lookup(&ctx->file_table.data, fd);
2006 node->refs++;
2007 req->file_node = node;
2008 req->flags |= io_slot_flags(node);
2029 const struct io_cold_def *def = &io_cold_defs[req->opcode];
2031 if (req->flags & REQ_F_SQE_COPIED)
2033 req->flags |= REQ_F_SQE_COPIED;
2034 if (!def->sqe_copy)
2037 return -EFAULT;
2038 def->sqe_copy(req);
2043 __must_hold(&req->ctx->uring_lock)
2045 if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) {
2068 __must_hold(&req->ctx->uring_lock)
2078 * doesn't support non-blocking read/write attempts
2085 __must_hold(&req->ctx->uring_lock)
2087 if (unlikely(req->flags & REQ_F_FAIL)) {
2092 req->flags &= ~REQ_F_HARDLINK;
2093 req->flags |= REQ_F_LINK;
2094 io_req_defer_failed(req, req->cqe.res);
2098 if (unlikely(req->ctx->drain_active))
2114 if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
2117 if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
2118 ctx->restrictions.sqe_flags_required)
2121 if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
2122 ctx->restrictions.sqe_flags_required))
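/*
 * The checks above enforce restrictions registered against an
 * IORING_SETUP_R_DISABLED ring. A minimal sketch that only allows NOP and
 * READ before enabling the ring; field names follow the io_uring uapi and
 * liburing wrappers as best recalled, and the function name is illustrative.
 */
#include <liburing.h>
#include <string.h>

static int restricted_ring(struct io_uring *ring)
{
	struct io_uring_restriction res[2];
	int ret;

	ret = io_uring_queue_init(8, ring, IORING_SETUP_R_DISABLED);
	if (ret < 0)
		return ret;

	memset(res, 0, sizeof(res));
	res[0].opcode = IORING_RESTRICTION_SQE_OP;
	res[0].sqe_op = IORING_OP_NOP;
	res[1].opcode = IORING_RESTRICTION_SQE_OP;
	res[1].sqe_op = IORING_OP_READ;

	ret = io_uring_register_restrictions(ring, res, 2);
	if (ret)
		return ret;
	/* any other opcode now fails io_check_restriction() with -EACCES */
	return io_uring_enable_rings(ring);
}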
2130 struct io_kiocb *head = ctx->submit_state.link.head;
2132 ctx->drain_active = true;
2141 head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
2142 ctx->drain_next = true;
2148 /* ensure per-opcode data is cleared if we fail before prep */
2149 memset(&req->cmd.data, 0, sizeof(req->cmd.data));
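/*
 * The drain bookkeeping above (ctx->drain_active, defer_list, nr_drained)
 * is driven by IOSQE_IO_DRAIN from userspace: a drain SQE is not issued
 * until every previously submitted SQE has completed. A minimal sketch of
 * using it as a barrier; fd and buffer are illustrative.
 */
#include <liburing.h>

static void write_then_barrier_fsync(struct io_uring *ring, int fd,
				     const void *buf, unsigned len)
{
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_write(sqe, fd, buf, len, 0);

	/* held back until the write (and anything else in flight) completes */
	sqe = io_uring_get_sqe(ring);
	io_uring_prep_fsync(sqe, fd, 0);
	io_uring_sqe_set_flags(sqe, IOSQE_IO_DRAIN);

	io_uring_submit(ring);
}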
2155 __must_hold(&ctx->uring_lock)
2162 req->ctx = ctx;
2163 req->opcode = opcode = READ_ONCE(sqe->opcode);
2165 sqe_flags = READ_ONCE(sqe->flags);
2166 req->flags = (__force io_req_flags_t) sqe_flags;
2167 req->cqe.user_data = READ_ONCE(sqe->user_data);
2168 req->file = NULL;
2169 req->tctx = current->io_uring;
2170 req->cancel_seq_set = false;
2171 req->async_data = NULL;
2174 req->opcode = 0;
2175 return io_init_fail_req(req, -EINVAL);
2183 return io_init_fail_req(req, -EINVAL);
2185 if (!def->buffer_select)
2186 return io_init_fail_req(req, -EOPNOTSUPP);
2187 req->buf_index = READ_ONCE(sqe->buf_group);
2190 ctx->drain_disabled = true;
2192 if (ctx->drain_disabled)
2193 return io_init_fail_req(req, -EOPNOTSUPP);
2197 if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
2198 if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
2199 return io_init_fail_req(req, -EACCES);
2201 if (ctx->drain_active)
2202 req->flags |= REQ_F_FORCE_ASYNC;
2204 if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
2205 ctx->drain_next = false;
2206 ctx->drain_active = true;
2207 req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
2211 if (!def->ioprio && sqe->ioprio)
2212 return io_init_fail_req(req, -EINVAL);
2213 if (!def->iopoll && (ctx->flags & IORING_SETUP_IOPOLL))
2214 return io_init_fail_req(req, -EINVAL);
2216 if (def->needs_file) {
2217 struct io_submit_state *state = &ctx->submit_state;
2219 req->cqe.fd = READ_ONCE(sqe->fd);
2225 if (state->need_plug && def->plug) {
2226 state->plug_started = true;
2227 state->need_plug = false;
2228 blk_start_plug_nr_ios(&state->plug, state->submit_nr);
2232 personality = READ_ONCE(sqe->personality);
2236 req->creds = xa_load(&ctx->personalities, personality);
2237 if (!req->creds)
2238 return io_init_fail_req(req, -EINVAL);
2239 get_cred(req->creds);
2240 ret = security_uring_override_creds(req->creds);
2242 put_cred(req->creds);
2245 req->flags |= REQ_F_CREDS;
2248 return def->prep(req, sqe);
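/*
 * The personality lookup above (xa_load(&ctx->personalities, ...)) resolves
 * ids created with io_uring_register_personality(), which snapshots the
 * caller's credentials. A minimal sketch; typically the id is registered
 * while privileged and used after dropping privileges. fd/buffer and the
 * helper name are illustrative.
 */
#include <liburing.h>

static int submit_with_personality(struct io_uring *ring, int fd,
				   void *buf, unsigned len)
{
	struct io_uring_sqe *sqe;
	int cred_id;

	cred_id = io_uring_register_personality(ring);	/* current creds */
	if (cred_id < 0)
		return cred_id;

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_read(sqe, fd, buf, len, 0);
	/* issue this request with the registered creds, not the current ones */
	sqe->personality = cred_id;

	return io_uring_submit(ring);
}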
2254 struct io_ring_ctx *ctx = req->ctx;
2255 struct io_submit_link *link = &ctx->submit_state.link;
2256 struct io_kiocb *head = link->head;
2267 if (head && !(head->flags & REQ_F_FAIL))
2268 req_fail_link_node(head, -ECANCELED);
2270 if (!(req->flags & IO_REQ_LINK_FLAGS)) {
2272 link->last->link = req;
2273 link->head = NULL;
2281 link->last->link = req;
2283 link->head = req;
2284 link->last = req;
2290 __must_hold(&ctx->uring_lock)
2292 struct io_submit_link *link = &ctx->submit_state.link;
2308 if (unlikely(link->head)) {
2309 trace_io_uring_link(req, link->last);
2311 link->last->link = req;
2312 link->last = req;
2314 if (req->flags & IO_REQ_LINK_FLAGS)
2317 req = link->head;
2318 link->head = NULL;
2319 if (req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))
2322 } else if (unlikely(req->flags & (IO_REQ_LINK_FLAGS |
2324 if (req->flags & IO_REQ_LINK_FLAGS) {
2325 link->head = req;
2326 link->last = req;
2343 struct io_submit_state *state = &ctx->submit_state;
2345 if (unlikely(state->link.head))
2346 io_queue_sqe_fallback(state->link.head);
2349 if (state->plug_started)
2350 blk_finish_plug(&state->plug);
2359 state->plug_started = false;
2360 state->need_plug = max_ios > 2;
2361 state->submit_nr = max_ios;
2363 state->link.head = NULL;
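/*
 * The link->head/link->last bookkeeping above builds chains from
 * IOSQE_IO_LINK. A minimal sketch: the fsync only runs after the write
 * succeeds; if the write fails, the rest of the chain is completed with
 * -ECANCELED (req_fail_link_node() above). fd is illustrative.
 */
#include <liburing.h>

static int linked_write_fsync(struct io_uring *ring, int fd,
			      const void *buf, unsigned len)
{
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_write(sqe, fd, buf, len, 0);
	io_uring_sqe_set_flags(sqe, IOSQE_IO_LINK);	/* chain to the next SQE */
	io_uring_sqe_set_data64(sqe, 1);

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_fsync(sqe, fd, 0);		/* last link: no flag */
	io_uring_sqe_set_data64(sqe, 2);

	return io_uring_submit(ring);	/* both count as submitted */
}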
2368 struct io_rings *rings = ctx->rings;
2375 smp_store_release(&rings->sq.head, ctx->cached_sq_head);
2384 * prevent a re-load down the line.
2388 unsigned mask = ctx->sq_entries - 1;
2389 unsigned head = ctx->cached_sq_head++ & mask;
2392 (!(ctx->flags & IORING_SETUP_NO_SQARRAY))) {
2393 head = READ_ONCE(ctx->sq_array[head]);
2394 if (unlikely(head >= ctx->sq_entries)) {
2395 WRITE_ONCE(ctx->rings->sq_dropped,
2396 READ_ONCE(ctx->rings->sq_dropped) + 1);
2399 head = array_index_nospec(head, ctx->sq_entries);
2411 /* double index for 128-byte SQEs, twice as long */
2412 if (ctx->flags & IORING_SETUP_SQE128)
2414 *sqe = &ctx->sq_sqes[head];
2419 __must_hold(&ctx->uring_lock)
2430 io_submit_state_start(&ctx->submit_state, left);
2448 !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) {
2449 left--;
2452 } while (--left);
2455 ret -= left;
2458 ret = -EAGAIN;
2459 current->io_uring->cached_refs += left;
2477 if (io_should_wake(iowq) || io_has_work(iowq->ctx))
2479 return -1;
2492 return -EINTR;
2498 struct io_uring_task *tctx = current->io_uring;
2502 return percpu_counter_read_positive(&tctx->inflight);
2509 WRITE_ONCE(iowq->hit_timeout, 1);
2510 iowq->min_timeout = 0;
2511 wake_up_process(iowq->wq.private);
2523 struct io_ring_ctx *ctx = iowq->ctx;
2526 if (iowq->timeout == KTIME_MAX ||
2527 ktime_compare(iowq->min_timeout, iowq->timeout) >= 0)
2533 if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail))
2545 if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
2546 atomic_set(&ctx->cq_wait_nr, 1);
2548 if (!llist_empty(&ctx->work_llist))
2552 hrtimer_update_function(&iowq->t, io_cqring_timer_wakeup);
2553 hrtimer_set_expires(timer, iowq->timeout);
2564 if (iowq->min_timeout) {
2565 timeout = ktime_add_ns(iowq->min_timeout, start_time);
2566 hrtimer_setup_on_stack(&iowq->t, io_cqring_min_timer_wakeup, clock_id,
2569 timeout = iowq->timeout;
2570 hrtimer_setup_on_stack(&iowq->t, io_cqring_timer_wakeup, clock_id,
2574 hrtimer_set_expires_range_ns(&iowq->t, timeout, 0);
2575 hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS);
2577 if (!READ_ONCE(iowq->hit_timeout))
2580 hrtimer_cancel(&iowq->t);
2581 destroy_hrtimer_on_stack(&iowq->t);
2584 return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0;
2590 const sigset_t __user *sig;
2605 * can take into account that the task is waiting for IO - turns out
2608 if (ext_arg->iowait && current_pending_io())
2609 current->in_iowait = 1;
2610 if (iowq->timeout != KTIME_MAX || iowq->min_timeout)
2611 ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time);
2614 current->in_iowait = 0;
2624 if (unlikely(READ_ONCE(ctx->check_cq)))
2631 return -EINTR;
2646 struct io_rings *rings = ctx->rings;
2650 min_events = min_t(int, min_events, ctx->cq_entries);
2653 return -EEXIST;
2659 if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)))
2668 iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
2669 iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail);
2670 iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
2672 iowq.min_timeout = ext_arg->min_time;
2676 if (ext_arg->ts_set) {
2677 iowq.timeout = timespec64_to_ktime(ext_arg->ts);
2682 if (ext_arg->sig) {
2685 ret = set_compat_user_sigmask((const compat_sigset_t __user *)ext_arg->sig,
2686 ext_arg->argsz);
2689 ret = set_user_sigmask(ext_arg->sig, ext_arg->argsz);
2704 nr_wait = (int) iowq.cq_tail -
2705 READ_ONCE(ctx->rings->cq.tail);
2709 if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
2710 atomic_set(&ctx->cq_wait_nr, nr_wait);
2713 prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
2719 atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
2731 * Non-local task_work will be run on exit to userspace, but
2742 check_cq = READ_ONCE(ctx->check_cq);
2748 ret = -EBADR;
2760 if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
2761 finish_wait(&ctx->cq_wait, &iowq.wq);
2762 restore_saved_sigmask_unless(ret == -EINTR);
2764 return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
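/*
 * io_cqring_wait() above implements waiting with a timeout and a temporary
 * signal mask. A minimal liburing sketch (the helper name is illustrative):
 * the timespec and mask are handed down through the same ext_arg path, and
 * the mask is swapped in only for the duration of the wait, like ppoll().
 */
#include <liburing.h>
#include <signal.h>
#include <errno.h>

static int wait_one_sec(struct io_uring *ring)
{
	struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
	struct io_uring_cqe *cqe;
	sigset_t mask;
	int ret;

	sigemptyset(&mask);
	sigaddset(&mask, SIGINT);	/* block SIGINT only while sleeping */

	ret = io_uring_wait_cqes(ring, &cqe, 1, &ts, &mask);
	if (ret == -ETIME)
		return 0;		/* the hit_timeout path above */
	if (ret < 0)
		return ret;		/* e.g. -EINTR */
	io_uring_cqe_seen(ring, cqe);
	return 1;
}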
2769 io_free_region(ctx, &ctx->sq_region);
2770 io_free_region(ctx, &ctx->ring_region);
2771 ctx->rings = NULL;
2772 ctx->sq_sqes = NULL;
2828 ctx->nr_req_allocated -= nr;
2829 percpu_ref_put_many(&ctx->refs, nr);
2835 guard(mutex)(&ctx->uring_lock);
2843 mutex_lock(&ctx->uring_lock);
2851 io_free_region(ctx, &ctx->param_region);
2852 mutex_unlock(&ctx->uring_lock);
2853 if (ctx->sq_creds)
2854 put_cred(ctx->sq_creds);
2855 if (ctx->submitter_task)
2856 put_task_struct(ctx->submitter_task);
2858 WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
2860 if (ctx->mm_account) {
2861 mmdrop(ctx->mm_account);
2862 ctx->mm_account = NULL;
2866 if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
2869 percpu_ref_exit(&ctx->refs);
2870 free_uid(ctx->user);
2873 WARN_ON_ONCE(ctx->nr_req_allocated);
2875 if (ctx->hash_map)
2876 io_wq_put_hash(ctx->hash_map);
2878 kvfree(ctx->cancel_table.hbs);
2879 xa_destroy(&ctx->io_bl_xa);
2888 mutex_lock(&ctx->uring_lock);
2889 ctx->poll_activated = true;
2890 mutex_unlock(&ctx->uring_lock);
2896 wake_up_all(&ctx->poll_wq);
2897 percpu_ref_put(&ctx->refs);
2902 spin_lock(&ctx->completion_lock);
2904 if (ctx->poll_activated || ctx->poll_wq_task_work.func)
2906 if (WARN_ON_ONCE(!ctx->task_complete))
2908 if (!ctx->submitter_task)
2911 * with ->submitter_task only the submitter task completes requests, we
2914 init_task_work(&ctx->poll_wq_task_work, io_activate_pollwq_cb);
2915 percpu_ref_get(&ctx->refs);
2916 if (task_work_add(ctx->submitter_task, &ctx->poll_wq_task_work, TWA_SIGNAL))
2917 percpu_ref_put(&ctx->refs);
2919 spin_unlock(&ctx->completion_lock);
2924 struct io_ring_ctx *ctx = file->private_data;
2927 if (unlikely(!ctx->poll_activated))
2933 poll_wait(file, &ctx->poll_wq, wait);
2942 * ---- ----
2943 * lock(&ctx->uring_lock);
2944 * lock(&ep->mtx);
2945 * lock(&ctx->uring_lock);
2946 * lock(&ep->mtx);
2966 struct io_uring_task *tctx = current->io_uring;
2976 if (tctx && !atomic_read(&tctx->in_cancel))
2977 io_uring_del_tctx_node((unsigned long)work->ctx);
2978 complete(&work->completion);
2985 return req->ctx == data;
2999 * submitted async (out-of-line), then completions can come in while
3004 if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) {
3005 mutex_lock(&ctx->uring_lock);
3007 mutex_unlock(&ctx->uring_lock);
3009 if (!xa_empty(&ctx->zcrx_ctxs)) {
3010 mutex_lock(&ctx->uring_lock);
3012 mutex_unlock(&ctx->uring_lock);
3015 if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
3022 if (ctx->sq_data) {
3023 struct io_sq_data *sqd = ctx->sq_data;
3028 if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
3029 io_wq_cancel_cb(tsk->io_uring->io_wq,
3051 } while (!wait_for_completion_interruptible_timeout(&ctx->ref_comp, interval));
3057 mutex_lock(&ctx->uring_lock);
3058 while (!list_empty(&ctx->tctx_list)) {
3061 node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
3064 list_rotate_left(&ctx->tctx_list);
3065 ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
3069 mutex_unlock(&ctx->uring_lock);
3076 mutex_lock(&ctx->uring_lock);
3078 mutex_unlock(&ctx->uring_lock);
3079 spin_lock(&ctx->completion_lock);
3080 spin_unlock(&ctx->completion_lock);
3083 if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
3094 mutex_lock(&ctx->uring_lock);
3095 percpu_ref_kill(&ctx->refs);
3096 xa_for_each(&ctx->personalities, index, creds)
3098 mutex_unlock(&ctx->uring_lock);
3100 flush_delayed_work(&ctx->fallback_work);
3102 INIT_WORK(&ctx->exit_work, io_ring_exit_work);
3109 queue_work(iou_wq, &ctx->exit_work);
3114 struct io_ring_ctx *ctx = file->private_data;
3116 file->private_data = NULL;
3131 return io_match_task_safe(req, cancel->tctx, cancel->all);
3141 list_for_each_entry_reverse(de, &ctx->defer_list, list) {
3142 if (io_match_task_safe(de->req, tctx, cancel_all)) {
3143 list_cut_position(&list, &ctx->defer_list, &de->list);
3152 list_del_init(&de->list);
3153 ctx->nr_drained -= io_linked_nr(de->req);
3154 io_req_task_queue_fail(de->req, -ECANCELED);
3166 mutex_lock(&ctx->uring_lock);
3167 list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
3168 struct io_uring_task *tctx = node->task->io_uring;
3174 if (!tctx || !tctx->io_wq)
3176 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
3179 mutex_unlock(&ctx->uring_lock);
3194 if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
3195 atomic_set(&ctx->cq_wait_nr, 1);
3200 if (!ctx->rings)
3205 } else if (tctx->io_wq) {
3210 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
3216 if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
3218 while (!wq_list_empty(&ctx->iopoll_list)) {
3225 if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
3228 mutex_lock(&ctx->uring_lock);
3234 mutex_unlock(&ctx->uring_lock);
3239 ret |= flush_delayed_work(&ctx->fallback_work);
3246 return atomic_read(&tctx->inflight_tracked);
3247 return percpu_counter_sum(&tctx->inflight);
3252 * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation.
3256 struct io_uring_task *tctx = current->io_uring;
3265 if (!current->io_uring)
3267 if (tctx->io_wq)
3268 io_wq_exit_start(tctx->io_wq);
3270 atomic_inc(&tctx->in_cancel);
3284 xa_for_each(&tctx->xa, index, node) {
3286 if (node->ctx->sq_data)
3288 loop |= io_uring_try_cancel_requests(node->ctx,
3289 current->io_uring,
3294 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
3296 current->io_uring,
3306 prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
3309 xa_for_each(&tctx->xa, index, node) {
3310 if (io_local_work_pending(node->ctx)) {
3311 WARN_ON_ONCE(node->ctx->submitter_task &&
3312 node->ctx->submitter_task != current);
3324 finish_wait(&tctx->wait, &wait);
3331 * ->in_cancel set for normal exit.
3333 atomic_dec(&tctx->in_cancel);
3353 return ERR_PTR(-EFAULT);
3355 /* also protects from NULL ->cq_wait_arg as the size would be 0 */
3357 end > ctx->cq_wait_size))
3358 return ERR_PTR(-EFAULT);
3360 offset = array_index_nospec(offset, ctx->cq_wait_size - size);
3361 return ctx->cq_wait_arg + offset;
3372 return -EINVAL;
3374 return -EINVAL;
3376 return -EFAULT;
3386 ext_arg->iowait = !(flags & IORING_ENTER_NO_IOWAIT);
3393 ext_arg->sig = (const sigset_t __user *) argp;
3400 if (ext_arg->argsz != sizeof(struct io_uring_reg_wait))
3401 return -EINVAL;
3406 if (w->flags & ~IORING_REG_WAIT_TS)
3407 return -EINVAL;
3408 ext_arg->min_time = READ_ONCE(w->min_wait_usec) * NSEC_PER_USEC;
3409 ext_arg->sig = u64_to_user_ptr(READ_ONCE(w->sigmask));
3410 ext_arg->argsz = READ_ONCE(w->sigmask_sz);
3411 if (w->flags & IORING_REG_WAIT_TS) {
3412 ext_arg->ts.tv_sec = READ_ONCE(w->ts.tv_sec);
3413 ext_arg->ts.tv_nsec = READ_ONCE(w->ts.tv_nsec);
3414 ext_arg->ts_set = true;
3420 * EXT_ARG is set - ensure we agree on the size of it and copy in our
3423 if (ext_arg->argsz != sizeof(arg))
3424 return -EINVAL;
3427 return -EFAULT;
3428 unsafe_get_user(arg.sigmask, &uarg->sigmask, uaccess_end);
3429 unsafe_get_user(arg.sigmask_sz, &uarg->sigmask_sz, uaccess_end);
3430 unsafe_get_user(arg.min_wait_usec, &uarg->min_wait_usec, uaccess_end);
3431 unsafe_get_user(arg.ts, &uarg->ts, uaccess_end);
3435 return -EFAULT;
3437 ext_arg->min_time = arg.min_wait_usec * NSEC_PER_USEC;
3438 ext_arg->sig = u64_to_user_ptr(arg.sigmask);
3439 ext_arg->argsz = arg.sigmask_sz;
3441 if (get_timespec64(&ext_arg->ts, u64_to_user_ptr(arg.ts)))
3442 return -EFAULT;
3443 ext_arg->ts_set = true;
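/*
 * The unsafe_get_user() copies above parse struct io_uring_getevents_arg,
 * i.e. the raw IORING_ENTER_EXT_ARG ABI. A minimal raw-syscall sketch of a
 * timed wait; ring_fd is assumed to come from io_uring_setup() elsewhere,
 * and on kernels without min-wait support the third field is plain padding
 * and must stay zero (it is zeroed here either way).
 */
#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>

static long enter_ext_arg_wait(int ring_fd, long long tv_sec)
{
	struct __kernel_timespec ts = { .tv_sec = tv_sec, .tv_nsec = 0 };
	struct io_uring_getevents_arg arg;

	memset(&arg, 0, sizeof(arg));	/* no sigmask, no min wait */
	arg.ts = (unsigned long long)(uintptr_t)&ts;

	/*
	 * With EXT_ARG the last two arguments become (&arg, sizeof(arg));
	 * returns 0 once a CQE is available, -1 with errno == ETIME on timeout.
	 */
	return syscall(__NR_io_uring_enter, ring_fd, 0, 1,
		       IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
		       &arg, sizeof(arg));
}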
3449 return -EFAULT;
3462 return -EINVAL;
3469 struct io_uring_task *tctx = current->io_uring;
3472 return -EINVAL;
3474 file = tctx->registered_rings[fd];
3476 return -EBADF;
3480 return -EBADF;
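/*
 * The tctx->registered_rings[] lookup above serves
 * IORING_ENTER_REGISTERED_RING: userspace passes an index into a per-task
 * table instead of an fd, skipping fget()/fput() on every enter. A minimal
 * sketch; liburing switches the ring over transparently once registered.
 */
#include <liburing.h>

static void use_registered_ring(struct io_uring *ring)
{
	/*
	 * After this, liburing's submit/wait helpers pass the registered
	 * index with IORING_ENTER_REGISTERED_RING instead of ring->ring_fd.
	 */
	if (io_uring_register_ring_fd(ring) < 0)
		;	/* fall back to the normal fd path */
}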
3481 ret = -EOPNOTSUPP;
3486 ctx = file->private_data;
3487 ret = -EBADFD;
3488 if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
3497 if (ctx->flags & IORING_SETUP_SQPOLL) {
3498 if (unlikely(ctx->sq_data->thread == NULL)) {
3499 ret = -EOWNERDEAD;
3503 wake_up(&ctx->sq_data->wait);
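/*
 * The SQPOLL branch above wakes the kernel submission thread when userspace
 * enters with IORING_ENTER_SQ_WAKEUP. A minimal setup sketch; with liburing,
 * io_uring_submit() checks IORING_SQ_NEED_WAKEUP and issues that enter for
 * you once the sq thread has gone idle. The idle value is illustrative.
 */
#include <liburing.h>
#include <string.h>

static int sqpoll_ring(struct io_uring *ring)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_SQPOLL;
	p.sq_thread_idle = 2000;	/* ms before the sq thread sleeps */

	/* may require elevated privileges on older kernels */
	return io_uring_queue_init_params(64, ring, &p);
}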
3513 mutex_lock(&ctx->uring_lock);
3516 mutex_unlock(&ctx->uring_lock);
3520 if (ctx->syscall_iopoll)
3526 if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
3529 mutex_unlock(&ctx->uring_lock);
3535 if (ctx->syscall_iopoll) {
3542 mutex_lock(&ctx->uring_lock);
3547 mutex_unlock(&ctx->uring_lock);
3565 if (unlikely(ret2 == -EBADR))
3567 &ctx->check_cq);
3591 return file->f_op == &io_uring_fops;
3603 ctx->sq_entries = p->sq_entries;
3604 ctx->cq_entries = p->cq_entries;
3606 size = rings_size(ctx->flags, p->sq_entries, p->cq_entries,
3609 return -EOVERFLOW;
3613 if (ctx->flags & IORING_SETUP_NO_MMAP) {
3614 rd.user_addr = p->cq_off.user_addr;
3617 ret = io_create_region(ctx, &ctx->ring_region, &rd, IORING_OFF_CQ_RING);
3620 ctx->rings = rings = io_region_get_ptr(&ctx->ring_region);
3622 if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
3623 ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
3624 rings->sq_ring_mask = p->sq_entries - 1;
3625 rings->cq_ring_mask = p->cq_entries - 1;
3626 rings->sq_ring_entries = p->sq_entries;
3627 rings->cq_ring_entries = p->cq_entries;
3629 if (p->flags & IORING_SETUP_SQE128)
3630 size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries);
3632 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
3635 return -EOVERFLOW;
3640 if (ctx->flags & IORING_SETUP_NO_MMAP) {
3641 rd.user_addr = p->sq_off.user_addr;
3644 ret = io_create_region(ctx, &ctx->sq_region, &rd, IORING_OFF_SQES);
3649 ctx->sq_sqes = io_region_get_ptr(&ctx->sq_region);
3678 unsigned flags = p->flags;
3683 return -EINVAL;
3690 return -EINVAL;
3696 return -EINVAL;
3701 return -EINVAL;
3709 return -EINVAL;
3717 return -EINVAL;
3725 return -EINVAL;
3727 if (!(p->flags & IORING_SETUP_CLAMP))
3728 return -EINVAL;
3740 p->sq_entries = roundup_pow_of_two(entries);
3741 if (p->flags & IORING_SETUP_CQSIZE) {
3744 * to a power-of-two, if it isn't already. We do NOT impose
3747 if (!p->cq_entries)
3748 return -EINVAL;
3749 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
3750 if (!(p->flags & IORING_SETUP_CLAMP))
3751 return -EINVAL;
3752 p->cq_entries = IORING_MAX_CQ_ENTRIES;
3754 p->cq_entries = roundup_pow_of_two(p->cq_entries);
3755 if (p->cq_entries < p->sq_entries)
3756 return -EINVAL;
3758 p->cq_entries = 2 * p->sq_entries;
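/*
 * The rounding and clamping above are visible to userspace through the
 * params struct: the kernel writes back the power-of-two sizes it actually
 * used. A minimal sketch asking for a CQ ring larger than the SQ ring; the
 * numbers are illustrative.
 */
#include <liburing.h>
#include <stdio.h>
#include <string.h>

static int sized_ring(struct io_uring *ring)
{
	struct io_uring_params p;
	int ret;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP;
	p.cq_entries = 4096;	/* must be >= sq_entries, rounded up to pow2 */

	ret = io_uring_queue_init_params(100, ring, &p);	/* sq: 100 -> 128 */
	if (ret < 0)
		return ret;

	/* the kernel reports the sizes it actually used */
	printf("sq_entries=%u cq_entries=%u\n", p.sq_entries, p.cq_entries);
	return 0;
}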
3761 p->sq_off.head = offsetof(struct io_rings, sq.head);
3762 p->sq_off.tail = offsetof(struct io_rings, sq.tail);
3763 p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
3764 p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
3765 p->sq_off.flags = offsetof(struct io_rings, sq_flags);
3766 p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
3767 p->sq_off.resv1 = 0;
3768 if (!(p->flags & IORING_SETUP_NO_MMAP))
3769 p->sq_off.user_addr = 0;
3771 p->cq_off.head = offsetof(struct io_rings, cq.head);
3772 p->cq_off.tail = offsetof(struct io_rings, cq.tail);
3773 p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
3774 p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
3775 p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
3776 p->cq_off.cqes = offsetof(struct io_rings, cqes);
3777 p->cq_off.flags = offsetof(struct io_rings, cq_flags);
3778 p->cq_off.resv1 = 0;
3779 if (!(p->flags & IORING_SETUP_NO_MMAP))
3780 p->cq_off.user_addr = 0;
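/*
 * The sq_off/cq_off values filled in above are how userspace locates the
 * ring members inside the mmap()ed regions. A minimal raw-syscall sketch,
 * assuming a zeroed params struct (no NO_MMAP/NO_SQARRAY flags) and the
 * single-mmap feature present on modern kernels; error handling is trimmed
 * and the struct/function names are illustrative.
 */
#include <linux/io_uring.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

struct ring_map {
	void *rings;			/* shared SQ/CQ ring mapping */
	struct io_uring_sqe *sqes;
	unsigned *sq_tail, *sq_mask, *sq_array;
	struct io_uring_cqe *cqes;
};

static int map_ring(unsigned entries, struct io_uring_params *p,
		    struct ring_map *r)
{
	size_t sq_sz, cq_sz, sz;
	int fd;

	fd = syscall(__NR_io_uring_setup, entries, p);
	if (fd < 0)
		return fd;

	/* size of the combined SQ/CQ ring mapping (sq_array sits last) */
	sq_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
	cq_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);
	sz = sq_sz > cq_sz ? sq_sz : cq_sz;

	r->rings = mmap(NULL, sz, PROT_READ | PROT_WRITE, MAP_SHARED,
			fd, IORING_OFF_SQ_RING);
	r->sqes = mmap(NULL, p->sq_entries * sizeof(struct io_uring_sqe),
		       PROT_READ | PROT_WRITE, MAP_SHARED, fd, IORING_OFF_SQES);
	if (r->rings == MAP_FAILED || r->sqes == MAP_FAILED)
		return -1;

	r->sq_tail  = (unsigned *)((char *)r->rings + p->sq_off.tail);
	r->sq_mask  = (unsigned *)((char *)r->rings + p->sq_off.ring_mask);
	r->sq_array = (unsigned *)((char *)r->rings + p->sq_off.array);
	r->cqes = (struct io_uring_cqe *)((char *)r->rings + p->cq_off.cqes);
	return fd;
}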
3803 return -ENOMEM;
3805 ctx->clockid = CLOCK_MONOTONIC;
3806 ctx->clock_offset = 0;
3808 if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
3811 if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
3812 !(ctx->flags & IORING_SETUP_IOPOLL) &&
3813 !(ctx->flags & IORING_SETUP_SQPOLL))
3814 ctx->task_complete = true;
3816 if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL))
3817 ctx->lockless_cq = true;
3820 * lazy poll_wq activation relies on ->task_complete for synchronisation
3823 if (!ctx->task_complete)
3824 ctx->poll_activated = true;
3832 if (ctx->flags & IORING_SETUP_IOPOLL &&
3833 !(ctx->flags & IORING_SETUP_SQPOLL))
3834 ctx->syscall_iopoll = 1;
3836 ctx->compat = in_compat_syscall();
3838 ctx->user = get_uid(current_user());
3844 if (ctx->flags & (IORING_SETUP_SQPOLL|IORING_SETUP_COOP_TASKRUN))
3845 ctx->notify_method = TWA_SIGNAL_NO_IPI;
3847 ctx->notify_method = TWA_SIGNAL;
3855 mmgrab(current->mm);
3856 ctx->mm_account = current->mm;
3862 if (!(p->flags & IORING_SETUP_NO_SQARRAY))
3863 p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
3869 p->features = IORING_FEAT_FLAGS;
3872 ret = -EFAULT;
3876 if (ctx->flags & IORING_SETUP_SINGLE_ISSUER
3877 && !(ctx->flags & IORING_SETUP_R_DISABLED)) {
3882 ctx->submitter_task = get_task_struct(current);
3894 tctx = current->io_uring;
3900 if (p->flags & IORING_SETUP_REGISTERED_FD_ONLY)
3907 trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
3928 return -EFAULT;
3931 return -EINVAL;
3935 return -EINVAL;
3945 return -EPERM;
3952 return -EPERM;
3955 return -EPERM;
3976 .useroffset = offsetof(struct io_kiocb, cmd.data),
3977 .usersize = sizeof_field(struct io_kiocb, cmd.data),
4034 BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
4044 /* ->buf_index is u16 */
4063 /* imu->dir is u8 */
4067 * Allow user copy in the per-command field, which starts after the