xref: /linux/io_uring/wait.c (revision c17ee635fd3a482b2ad2bf5e269755c2eae5f25e)
1*0105b056SJens Axboe // SPDX-License-Identifier: GPL-2.0
2*0105b056SJens Axboe /*
3*0105b056SJens Axboe  * Waiting for completion events
4*0105b056SJens Axboe  */
5*0105b056SJens Axboe #include <linux/kernel.h>
6*0105b056SJens Axboe #include <linux/sched/signal.h>
7*0105b056SJens Axboe #include <linux/io_uring.h>
8*0105b056SJens Axboe 
9*0105b056SJens Axboe #include <trace/events/io_uring.h>
10*0105b056SJens Axboe 
11*0105b056SJens Axboe #include <uapi/linux/io_uring.h>
12*0105b056SJens Axboe 
13*0105b056SJens Axboe #include "io_uring.h"
14*0105b056SJens Axboe #include "napi.h"
15*0105b056SJens Axboe #include "wait.h"
16*0105b056SJens Axboe 
17*0105b056SJens Axboe static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
18*0105b056SJens Axboe 			    int wake_flags, void *key)
19*0105b056SJens Axboe {
20*0105b056SJens Axboe 	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, wq);
21*0105b056SJens Axboe 
22*0105b056SJens Axboe 	/*
23*0105b056SJens Axboe 	 * Cannot safely flush overflowed CQEs from here, ensure we wake up
24*0105b056SJens Axboe 	 * the task, and the next invocation will do it.
25*0105b056SJens Axboe 	 */
26*0105b056SJens Axboe 	if (io_should_wake(iowq) || io_has_work(iowq->ctx))
27*0105b056SJens Axboe 		return autoremove_wake_function(curr, mode, wake_flags, key);
28*0105b056SJens Axboe 	return -1;
29*0105b056SJens Axboe }
30*0105b056SJens Axboe 
31*0105b056SJens Axboe int io_run_task_work_sig(struct io_ring_ctx *ctx)
32*0105b056SJens Axboe {
33*0105b056SJens Axboe 	if (io_local_work_pending(ctx)) {
34*0105b056SJens Axboe 		__set_current_state(TASK_RUNNING);
35*0105b056SJens Axboe 		if (io_run_local_work(ctx, INT_MAX, IO_LOCAL_TW_DEFAULT_MAX) > 0)
36*0105b056SJens Axboe 			return 0;
37*0105b056SJens Axboe 	}
38*0105b056SJens Axboe 	if (io_run_task_work() > 0)
39*0105b056SJens Axboe 		return 0;
40*0105b056SJens Axboe 	if (task_sigpending(current))
41*0105b056SJens Axboe 		return -EINTR;
42*0105b056SJens Axboe 	return 0;
43*0105b056SJens Axboe }
44*0105b056SJens Axboe 
45*0105b056SJens Axboe static bool current_pending_io(void)
46*0105b056SJens Axboe {
47*0105b056SJens Axboe 	struct io_uring_task *tctx = current->io_uring;
48*0105b056SJens Axboe 
49*0105b056SJens Axboe 	if (!tctx)
50*0105b056SJens Axboe 		return false;
51*0105b056SJens Axboe 	return percpu_counter_read_positive(&tctx->inflight);
52*0105b056SJens Axboe }
53*0105b056SJens Axboe 
/*
 * hrtimer callback for the regular CQ wait timeout: record that the timeout
 * fired and wake the waiting task.
 */
static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
{
	struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);

	/* store hit_timeout before the wakeup so the waiter sees it on resume */
	WRITE_ONCE(iowq->hit_timeout, 1);
	iowq->min_timeout = 0;
	/* wq.private is the task that set up this wait (see io_cqring_wait()) */
	wake_up_process(iowq->wq.private);
	return HRTIMER_NORESTART;
}
63*0105b056SJens Axboe 
/*
 * Doing min_timeout portion. If we saw any timeouts, events, or have work,
 * wake up. If not, and we have a normal timeout, switch to that and keep
 * sleeping.
 */
static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer)
{
	struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
	struct io_ring_ctx *ctx = iowq->ctx;

	/* no general timeout, or shorter (or equal), we are done */
	if (iowq->timeout == KTIME_MAX ||
	    ktime_compare(iowq->min_timeout, iowq->timeout) >= 0)
		goto out_wake;
	/* work we may need to run, wake function will see if we need to wake */
	if (io_has_work(ctx))
		goto out_wake;
	/* got events since we started waiting, min timeout is done */
	if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail))
		goto out_wake;
	/* if we have any events and min timeout expired, we're done */
	if (io_cqring_events(ctx))
		goto out_wake;

	/*
	 * If using deferred task_work running and application is waiting on
	 * more than one request, ensure we reset it now where we are switching
	 * to normal sleeps. Any request completion post min_wait should wake
	 * the task and return.
	 */
	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
		atomic_set(&ctx->cq_wait_nr, 1);
		/* order the cq_wait_nr store before the work_llist re-check */
		smp_mb();
		if (!llist_empty(&ctx->work_llist))
			goto out_wake;
	}

	/* any generated CQE posted past this time should wake us up */
	iowq->cq_tail = iowq->cq_min_tail;

	/* min_timeout phase over: re-arm for the remaining general timeout */
	hrtimer_update_function(&iowq->t, io_cqring_timer_wakeup);
	hrtimer_set_expires(timer, iowq->timeout);
	return HRTIMER_RESTART;
out_wake:
	return io_cqring_timer_wakeup(timer);
}
110*0105b056SJens Axboe 
/*
 * Arm an on-stack hrtimer for this wait and sleep until we're woken or the
 * timer fires. Returns -ETIME if the (final) timeout was hit, 0 otherwise.
 */
static int io_cqring_schedule_timeout(struct io_wait_queue *iowq,
				      clockid_t clock_id, ktime_t start_time)
{
	ktime_t timeout;

	if (iowq->min_timeout) {
		/* min_timeout is relative to when the wait started */
		timeout = ktime_add_ns(iowq->min_timeout, start_time);
		hrtimer_setup_on_stack(&iowq->t, io_cqring_min_timer_wakeup, clock_id,
				       HRTIMER_MODE_ABS);
	} else {
		/* iowq->timeout is already an absolute expiry time */
		timeout = iowq->timeout;
		hrtimer_setup_on_stack(&iowq->t, io_cqring_timer_wakeup, clock_id,
				       HRTIMER_MODE_ABS);
	}

	hrtimer_set_expires_range_ns(&iowq->t, timeout, 0);
	hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS);

	/* the timer may already have fired before we got to sleep */
	if (!READ_ONCE(iowq->hit_timeout))
		schedule();

	/* timer is on our stack, it must be fully stopped before we return */
	hrtimer_cancel(&iowq->t);
	destroy_hrtimer_on_stack(&iowq->t);
	__set_current_state(TASK_RUNNING);

	return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0;
}
138*0105b056SJens Axboe 
/*
 * Actually go to sleep, with or without a timeout. Returns 0, or -ETIME if
 * a requested timeout expired while sleeping.
 */
static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
				     struct io_wait_queue *iowq,
				     struct ext_arg *ext_arg,
				     ktime_t start_time)
{
	int ret = 0;

	/*
	 * Mark us as being in io_wait if we have pending requests, so cpufreq
	 * can take into account that the task is waiting for IO - turns out
	 * to be important for low QD IO.
	 */
	if (ext_arg->iowait && current_pending_io())
		current->in_iowait = 1;
	/* only set up an hrtimer if some timeout was actually requested */
	if (iowq->timeout != KTIME_MAX || iowq->min_timeout)
		ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time);
	else
		schedule();
	current->in_iowait = 0;
	return ret;
}
160*0105b056SJens Axboe 
161*0105b056SJens Axboe /* If this returns > 0, the caller should retry */
162*0105b056SJens Axboe static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
163*0105b056SJens Axboe 					  struct io_wait_queue *iowq,
164*0105b056SJens Axboe 					  struct ext_arg *ext_arg,
165*0105b056SJens Axboe 					  ktime_t start_time)
166*0105b056SJens Axboe {
167*0105b056SJens Axboe 	if (unlikely(READ_ONCE(ctx->check_cq)))
168*0105b056SJens Axboe 		return 1;
169*0105b056SJens Axboe 	if (unlikely(io_local_work_pending(ctx)))
170*0105b056SJens Axboe 		return 1;
171*0105b056SJens Axboe 	if (unlikely(task_work_pending(current)))
172*0105b056SJens Axboe 		return 1;
173*0105b056SJens Axboe 	if (unlikely(task_sigpending(current)))
174*0105b056SJens Axboe 		return -EINTR;
175*0105b056SJens Axboe 	if (unlikely(io_should_wake(iowq)))
176*0105b056SJens Axboe 		return 0;
177*0105b056SJens Axboe 
178*0105b056SJens Axboe 	return __io_cqring_wait_schedule(ctx, iowq, ext_arg, start_time);
179*0105b056SJens Axboe }
180*0105b056SJens Axboe 
/*
 * Wait until events become available, if we don't already have some. The
 * application must reap them itself, as they reside on the shared cq ring.
 */
int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
		   struct ext_arg *ext_arg)
{
	struct io_wait_queue iowq;
	struct io_rings *rings = ctx->rings;
	ktime_t start_time;
	int ret;

	/* can't usefully wait for more events than the CQ ring can hold */
	min_events = min_t(int, min_events, ctx->cq_entries);

	if (!io_allowed_run_tw(ctx))
		return -EEXIST;
	if (io_local_work_pending(ctx))
		io_run_local_work(ctx, min_events,
				  max(IO_LOCAL_TW_DEFAULT_MAX, min_events));
	io_run_task_work();

	if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)))
		io_cqring_do_overflow_flush(ctx);
	/* running task_work may already have satisfied the wait */
	if (__io_cqring_events_user(ctx) >= min_events)
		return 0;

	init_waitqueue_func_entry(&iowq.wq, io_wake_function);
	iowq.wq.private = current;
	INIT_LIST_HEAD(&iowq.wq.entry);
	iowq.ctx = ctx;
	/* target CQ tail: current head plus the requested event count */
	iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
	/* snapshot of the tail at wait start, for min_timeout progress checks */
	iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail);
	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
	iowq.hit_timeout = 0;
	iowq.min_timeout = ext_arg->min_time;
	iowq.timeout = KTIME_MAX;
	start_time = io_get_time(ctx);

	if (ext_arg->ts_set) {
		iowq.timeout = timespec64_to_ktime(ext_arg->ts);
		/* convert a relative timeout into an absolute expiry */
		if (!(flags & IORING_ENTER_ABS_TIMER))
			iowq.timeout = ktime_add(iowq.timeout, start_time);
	}

	if (ext_arg->sig) {
#ifdef CONFIG_COMPAT
		if (in_compat_syscall())
			ret = set_compat_user_sigmask((const compat_sigset_t __user *)ext_arg->sig,
						      ext_arg->argsz);
		else
#endif
			ret = set_user_sigmask(ext_arg->sig, ext_arg->argsz);

		if (ret)
			return ret;
	}

	io_napi_busy_loop(ctx, &iowq);

	trace_io_uring_cqring_wait(ctx, min_events);
	do {
		unsigned long check_cq;
		int nr_wait;

		/* if min timeout has been hit, don't reset wait count */
		if (!iowq.hit_timeout)
			nr_wait = (int) iowq.cq_tail -
					READ_ONCE(ctx->rings->cq.tail);
		else
			nr_wait = 1;

		if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
			/* publish how many completions we need before a wakeup */
			atomic_set(&ctx->cq_wait_nr, nr_wait);
			set_current_state(TASK_INTERRUPTIBLE);
		} else {
			prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
							TASK_INTERRUPTIBLE);
		}

		ret = io_cqring_wait_schedule(ctx, &iowq, ext_arg, start_time);
		__set_current_state(TASK_RUNNING);
		atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);

		/*
		 * Run task_work after scheduling and before io_should_wake().
		 * If we got woken because of task_work being processed, run it
		 * now rather than let the caller do another wait loop.
		 */
		if (io_local_work_pending(ctx))
			io_run_local_work(ctx, nr_wait, nr_wait);
		io_run_task_work();

		/*
		 * Non-local task_work will be run on exit to userspace, but
		 * if we're using DEFER_TASKRUN, then we could have waited
		 * with a timeout for a number of requests. If the timeout
		 * hits, we could have some requests ready to process. Ensure
		 * this break is _after_ we have run task_work, to avoid
		 * deferring running potentially pending requests until the
		 * next time we wait for events.
		 */
		if (ret < 0)
			break;

		check_cq = READ_ONCE(ctx->check_cq);
		if (unlikely(check_cq)) {
			/* let the caller flush overflows, retry */
			if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
				io_cqring_do_overflow_flush(ctx);
			if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) {
				ret = -EBADR;
				break;
			}
		}

		if (io_should_wake(&iowq)) {
			ret = 0;
			break;
		}
		cond_resched();
	} while (1);

	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		finish_wait(&ctx->cq_wait, &iowq.wq);
	restore_saved_sigmask_unless(ret == -EINTR);

	/* if any CQEs are available for the app to reap, report success */
	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}
309