xref: /linux/io_uring/napi.c (revision 3ade6ce1255e6e97f91b8ba77408dce9d2292df2)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "io_uring.h"
4 #include "napi.h"
5 
6 #ifdef CONFIG_NET_RX_BUSY_POLL
7 
8 /* Timeout for cleanout of stale entries. */
9 #define NAPI_TIMEOUT		(60 * SEC_CONVERSION)
10 
11 struct io_napi_entry {
12 	unsigned int		napi_id;
13 	struct list_head	list;
14 
15 	unsigned long		timeout;
16 	struct hlist_node	node;
17 
18 	struct rcu_head		rcu;
19 };
20 
21 static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list,
22 					       unsigned int napi_id)
23 {
24 	struct io_napi_entry *e;
25 
26 	hlist_for_each_entry_rcu(e, hash_list, node) {
27 		if (e->napi_id != napi_id)
28 			continue;
29 		e->timeout = jiffies + NAPI_TIMEOUT;
30 		return e;
31 	}
32 
33 	return NULL;
34 }
35 
36 static inline ktime_t net_to_ktime(unsigned long t)
37 {
38 	/* napi approximating usecs, reverse busy_loop_current_time */
39 	return ns_to_ktime(t << 10);
40 }
41 
42 void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock)
43 {
44 	struct hlist_head *hash_list;
45 	unsigned int napi_id;
46 	struct sock *sk;
47 	struct io_napi_entry *e;
48 
49 	sk = sock->sk;
50 	if (!sk)
51 		return;
52 
53 	napi_id = READ_ONCE(sk->sk_napi_id);
54 
55 	/* Non-NAPI IDs can be rejected. */
56 	if (napi_id < MIN_NAPI_ID)
57 		return;
58 
59 	hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];
60 
61 	rcu_read_lock();
62 	e = io_napi_hash_find(hash_list, napi_id);
63 	if (e) {
64 		e->timeout = jiffies + NAPI_TIMEOUT;
65 		rcu_read_unlock();
66 		return;
67 	}
68 	rcu_read_unlock();
69 
70 	e = kmalloc(sizeof(*e), GFP_NOWAIT);
71 	if (!e)
72 		return;
73 
74 	e->napi_id = napi_id;
75 	e->timeout = jiffies + NAPI_TIMEOUT;
76 
77 	spin_lock(&ctx->napi_lock);
78 	if (unlikely(io_napi_hash_find(hash_list, napi_id))) {
79 		spin_unlock(&ctx->napi_lock);
80 		kfree(e);
81 		return;
82 	}
83 
84 	hlist_add_tail_rcu(&e->node, hash_list);
85 	list_add_tail(&e->list, &ctx->napi_list);
86 	spin_unlock(&ctx->napi_lock);
87 }
88 
/*
 * Reap every hash-table entry whose stale-out deadline has passed.
 *
 * Deleting while iterating is safe here: hlist_del_rcu() leaves the removed
 * node's ->next pointer intact and kfree_rcu() defers the actual free past a
 * grace period, so the traversal can continue through the deleted node.
 */
static void __io_napi_remove_stale(struct io_ring_ctx *ctx)
{
	struct io_napi_entry *e;
	unsigned int i;

	spin_lock(&ctx->napi_lock);
	hash_for_each(ctx->napi_ht, i, e, node) {
		/* Stale: no lookup hit refreshed e->timeout within NAPI_TIMEOUT. */
		if (time_after(jiffies, e->timeout)) {
			list_del(&e->list);
			hash_del_rcu(&e->node);
			kfree_rcu(e, rcu);
		}
	}
	spin_unlock(&ctx->napi_lock);
}
104 
105 static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale)
106 {
107 	if (is_stale)
108 		__io_napi_remove_stale(ctx);
109 }
110 
111 static inline bool io_napi_busy_loop_timeout(ktime_t start_time,
112 					     ktime_t bp)
113 {
114 	if (bp) {
115 		ktime_t end_time = ktime_add(start_time, bp);
116 		ktime_t now = net_to_ktime(busy_loop_current_time());
117 
118 		return ktime_after(now, end_time);
119 	}
120 
121 	return true;
122 }
123 
124 static bool io_napi_busy_loop_should_end(void *data,
125 					 unsigned long start_time)
126 {
127 	struct io_wait_queue *iowq = data;
128 
129 	if (signal_pending(current))
130 		return true;
131 	if (io_should_wake(iowq) || io_has_work(iowq->ctx))
132 		return true;
133 	if (io_napi_busy_loop_timeout(net_to_ktime(start_time),
134 				      iowq->napi_busy_poll_dt))
135 		return true;
136 
137 	return false;
138 }
139 
140 static bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx,
141 				   void *loop_end_arg)
142 {
143 	struct io_napi_entry *e;
144 	bool (*loop_end)(void *, unsigned long) = NULL;
145 	bool is_stale = false;
146 
147 	if (loop_end_arg)
148 		loop_end = io_napi_busy_loop_should_end;
149 
150 	list_for_each_entry_rcu(e, &ctx->napi_list, list) {
151 		napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg,
152 				   ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET);
153 
154 		if (time_after(jiffies, e->timeout))
155 			is_stale = true;
156 	}
157 
158 	return is_stale;
159 }
160 
/*
 * Busy-poll loop used on the blocking cqring-wait path. Repeats whole
 * passes over the napi list until io_napi_busy_loop_should_end() says to
 * stop, then reaps any stale entries that were noticed along the way.
 */
static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx,
				       struct io_wait_queue *iowq)
{
	unsigned long start_time = busy_loop_current_time();
	void *loop_end_arg = NULL;
	bool is_stale = false;

	/* Singular lists use a different napi loop end check function and are
	 * only executed once.
	 */
	if (list_is_singular(&ctx->napi_list))
		loop_end_arg = iowq;

	rcu_read_lock();
	do {
		is_stale = __io_napi_do_busy_loop(ctx, loop_end_arg);
		/*
		 * With a singular list (loop_end_arg set) the end check ran
		 * inside napi_busy_loop_rcu(), so a single pass suffices.
		 */
	} while (!io_napi_busy_loop_should_end(iowq, start_time) && !loop_end_arg);
	rcu_read_unlock();

	io_napi_remove_stale(ctx, is_stale);
}
182 
183 /*
184  * io_napi_init() - Init napi settings
185  * @ctx: pointer to io-uring context structure
186  *
187  * Init napi settings in the io-uring context.
188  */
189 void io_napi_init(struct io_ring_ctx *ctx)
190 {
191 	u64 sys_dt = READ_ONCE(sysctl_net_busy_poll) * NSEC_PER_USEC;
192 
193 	INIT_LIST_HEAD(&ctx->napi_list);
194 	spin_lock_init(&ctx->napi_lock);
195 	ctx->napi_prefer_busy_poll = false;
196 	ctx->napi_busy_poll_dt = ns_to_ktime(sys_dt);
197 }
198 
199 /*
200  * io_napi_free() - Deallocate napi
201  * @ctx: pointer to io-uring context structure
202  *
203  * Free the napi list and the hash table in the io-uring context.
204  */
void io_napi_free(struct io_ring_ctx *ctx)
{
	struct io_napi_entry *e;
	unsigned int i;

	spin_lock(&ctx->napi_lock);
	hash_for_each(ctx->napi_ht, i, e, node) {
		/*
		 * NOTE(review): entries are not list_del()'d from
		 * ctx->napi_list here — presumably safe because this runs at
		 * ring teardown and the list is never walked afterwards;
		 * confirm against the caller.
		 */
		hash_del_rcu(&e->node);
		kfree_rcu(e, rcu);
	}
	spin_unlock(&ctx->napi_lock);
}
217 
/*
 * io_register_napi() - Register napi with io-uring
 * @ctx: pointer to io-uring context structure
 * @arg: pointer to io_uring_napi structure
 *
 * Register napi in the io-uring context.
 */
225 int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
226 {
227 	const struct io_uring_napi curr = {
228 		.busy_poll_to 	  = ktime_to_us(ctx->napi_busy_poll_dt),
229 		.prefer_busy_poll = ctx->napi_prefer_busy_poll
230 	};
231 	struct io_uring_napi napi;
232 
233 	if (ctx->flags & IORING_SETUP_IOPOLL)
234 		return -EINVAL;
235 	if (copy_from_user(&napi, arg, sizeof(napi)))
236 		return -EFAULT;
237 	if (napi.pad[0] || napi.pad[1] || napi.pad[2] || napi.resv)
238 		return -EINVAL;
239 
240 	if (copy_to_user(arg, &curr, sizeof(curr)))
241 		return -EFAULT;
242 
243 	WRITE_ONCE(ctx->napi_busy_poll_dt, napi.busy_poll_to * NSEC_PER_USEC);
244 	WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi.prefer_busy_poll);
245 	WRITE_ONCE(ctx->napi_enabled, true);
246 	return 0;
247 }
248 
/*
 * io_unregister_napi() - Unregister napi with io-uring
 * @ctx: pointer to io-uring context structure
 * @arg: pointer to io_uring_napi structure, may be NULL
 *
 * Unregister napi. If arg has been specified copy the busy poll timeout and
 * prefer busy poll setting to the passed in structure.
 */
257 int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
258 {
259 	const struct io_uring_napi curr = {
260 		.busy_poll_to 	  = ktime_to_us(ctx->napi_busy_poll_dt),
261 		.prefer_busy_poll = ctx->napi_prefer_busy_poll
262 	};
263 
264 	if (arg && copy_to_user(arg, &curr, sizeof(curr)))
265 		return -EFAULT;
266 
267 	WRITE_ONCE(ctx->napi_busy_poll_dt, 0);
268 	WRITE_ONCE(ctx->napi_prefer_busy_poll, false);
269 	WRITE_ONCE(ctx->napi_enabled, false);
270 	return 0;
271 }
272 
/*
 * __io_napi_adjust_timeout() - adjust busy loop timeout
 * @ctx: pointer to io-uring context structure
 * @iowq: pointer to io wait queue
 * @to_wait: requested wait timeout (0 if no wait timeout was given)
 *
 * Adjust the busy loop timeout according to the wait timeout and the busy
 * poll timeout. If the configured NAPI timeout is bigger than the wait
 * timeout, then cap the NAPI timeout at the wait timeout.
 */
283 void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq,
284 			      ktime_t to_wait)
285 {
286 	ktime_t poll_dt = READ_ONCE(ctx->napi_busy_poll_dt);
287 
288 	if (to_wait)
289 		poll_dt = min(poll_dt, to_wait);
290 
291 	iowq->napi_busy_poll_dt = poll_dt;
292 }
293 
/*
 * __io_napi_busy_loop() - execute busy poll loop
 * @ctx: pointer to io-uring context structure
 * @iowq: pointer to io wait queue
 *
 * Execute the busy poll loop over the registered NAPI ids and reap any
 * entries that have gone stale.
 */
301 void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq)
302 {
303 	iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll);
304 
305 	if (!(ctx->flags & IORING_SETUP_SQPOLL) && ctx->napi_enabled)
306 		io_napi_blocking_busy_loop(ctx, iowq);
307 }
308 
/*
 * io_napi_sqpoll_busy_poll() - busy poll loop for sqpoll
 * @ctx: pointer to io-uring context structure
 *
 * Walk the napi list and execute the napi busy poll loop once.
 */
315 int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx)
316 {
317 	bool is_stale = false;
318 
319 	if (!READ_ONCE(ctx->napi_busy_poll_dt))
320 		return 0;
321 	if (list_empty_careful(&ctx->napi_list))
322 		return 0;
323 
324 	rcu_read_lock();
325 	is_stale = __io_napi_do_busy_loop(ctx, NULL);
326 	rcu_read_unlock();
327 
328 	io_napi_remove_stale(ctx, is_stale);
329 	return 1;
330 }
331 
332 #endif
333