xref: /linux/io_uring/napi.c (revision b4ba157044ea433a66126603ad7140e12dbc794b)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "io_uring.h"
4 #include "napi.h"
5 
6 #ifdef CONFIG_NET_RX_BUSY_POLL
7 
8 /* Timeout for cleanout of stale entries. */
9 #define NAPI_TIMEOUT		(60 * SEC_CONVERSION)
10 
/*
 * One NAPI instance tracked by an io_uring context. An entry is linked both
 * on ctx->napi_list and in a ctx->napi_ht bucket, and is released with
 * kfree_rcu().
 */
11 struct io_napi_entry {
12 	unsigned int		napi_id;	/* NAPI ID; the hash lookup key */
13 	struct list_head	list;		/* link on ctx->napi_list */
14 
15 	unsigned long		timeout;	/* jiffies value after which the entry is stale */
16 	struct hlist_node	node;		/* link in a ctx->napi_ht bucket */
17 
18 	struct rcu_head		rcu;		/* passed to kfree_rcu() on removal */
19 };
20 
21 static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list,
22 					       unsigned int napi_id)
23 {
24 	struct io_napi_entry *e;
25 
26 	hlist_for_each_entry_rcu(e, hash_list, node) {
27 		if (e->napi_id != napi_id)
28 			continue;
29 		e->timeout = jiffies + NAPI_TIMEOUT;
30 		return e;
31 	}
32 
33 	return NULL;
34 }
35 
36 void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock)
37 {
38 	struct hlist_head *hash_list;
39 	unsigned int napi_id;
40 	struct sock *sk;
41 	struct io_napi_entry *e;
42 
43 	sk = sock->sk;
44 	if (!sk)
45 		return;
46 
47 	napi_id = READ_ONCE(sk->sk_napi_id);
48 
49 	/* Non-NAPI IDs can be rejected. */
50 	if (napi_id < MIN_NAPI_ID)
51 		return;
52 
53 	hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];
54 
55 	rcu_read_lock();
56 	e = io_napi_hash_find(hash_list, napi_id);
57 	if (e) {
58 		e->timeout = jiffies + NAPI_TIMEOUT;
59 		rcu_read_unlock();
60 		return;
61 	}
62 	rcu_read_unlock();
63 
64 	e = kmalloc(sizeof(*e), GFP_NOWAIT);
65 	if (!e)
66 		return;
67 
68 	e->napi_id = napi_id;
69 	e->timeout = jiffies + NAPI_TIMEOUT;
70 
71 	spin_lock(&ctx->napi_lock);
72 	if (unlikely(io_napi_hash_find(hash_list, napi_id))) {
73 		spin_unlock(&ctx->napi_lock);
74 		kfree(e);
75 		return;
76 	}
77 
78 	hlist_add_tail_rcu(&e->node, hash_list);
79 	list_add_tail(&e->list, &ctx->napi_list);
80 	spin_unlock(&ctx->napi_lock);
81 }
82 
83 static void __io_napi_remove_stale(struct io_ring_ctx *ctx)
84 {
85 	struct io_napi_entry *e;
86 	unsigned int i;
87 
88 	spin_lock(&ctx->napi_lock);
89 	hash_for_each(ctx->napi_ht, i, e, node) {
90 		if (time_after(jiffies, e->timeout)) {
91 			list_del(&e->list);
92 			hash_del_rcu(&e->node);
93 			kfree_rcu(e, rcu);
94 		}
95 	}
96 	spin_unlock(&ctx->napi_lock);
97 }
98 
99 static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale)
100 {
101 	if (is_stale)
102 		__io_napi_remove_stale(ctx);
103 }
104 
105 static inline bool io_napi_busy_loop_timeout(unsigned long start_time,
106 					     unsigned long bp_usec)
107 {
108 	if (bp_usec) {
109 		unsigned long end_time = start_time + bp_usec;
110 		unsigned long now = busy_loop_current_time();
111 
112 		return time_after(now, end_time);
113 	}
114 
115 	return true;
116 }
117 
118 static bool io_napi_busy_loop_should_end(void *data,
119 					 unsigned long start_time)
120 {
121 	struct io_wait_queue *iowq = data;
122 
123 	if (signal_pending(current))
124 		return true;
125 	if (io_should_wake(iowq) || io_has_work(iowq->ctx))
126 		return true;
127 	if (io_napi_busy_loop_timeout(start_time, iowq->napi_busy_poll_to))
128 		return true;
129 
130 	return false;
131 }
132 
133 static bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx,
134 				   void *loop_end_arg)
135 {
136 	struct io_napi_entry *e;
137 	bool (*loop_end)(void *, unsigned long) = NULL;
138 	bool is_stale = false;
139 
140 	if (loop_end_arg)
141 		loop_end = io_napi_busy_loop_should_end;
142 
143 	list_for_each_entry_rcu(e, &ctx->napi_list, list) {
144 		napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg,
145 				   ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET);
146 
147 		if (time_after(jiffies, e->timeout))
148 			is_stale = true;
149 	}
150 
151 	return is_stale;
152 }
153 
154 static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx,
155 				       struct io_wait_queue *iowq)
156 {
157 	unsigned long start_time = busy_loop_current_time();
158 	void *loop_end_arg = NULL;
159 	bool is_stale = false;
160 
161 	/* Singular lists use a different napi loop end check function and are
162 	 * only executed once.
163 	 */
164 	if (list_is_singular(&ctx->napi_list))
165 		loop_end_arg = iowq;
166 
167 	rcu_read_lock();
168 	do {
169 		is_stale = __io_napi_do_busy_loop(ctx, loop_end_arg);
170 	} while (!io_napi_busy_loop_should_end(iowq, start_time) && !loop_end_arg);
171 	rcu_read_unlock();
172 
173 	io_napi_remove_stale(ctx, is_stale);
174 }
175 
176 /*
177  * io_napi_init() - Init napi settings
178  * @ctx: pointer to io-uring context structure
179  *
180  * Init napi settings in the io-uring context.
181  */
182 void io_napi_init(struct io_ring_ctx *ctx)
183 {
184 	INIT_LIST_HEAD(&ctx->napi_list);
185 	spin_lock_init(&ctx->napi_lock);
186 	ctx->napi_prefer_busy_poll = false;
187 	ctx->napi_busy_poll_to = READ_ONCE(sysctl_net_busy_poll);
188 }
189 
190 /*
191  * io_napi_free() - Deallocate napi
192  * @ctx: pointer to io-uring context structure
193  *
194  * Free the napi list and the hash table in the io-uring context.
195  */
196 void io_napi_free(struct io_ring_ctx *ctx)
197 {
198 	struct io_napi_entry *e;
199 	LIST_HEAD(napi_list);
200 	unsigned int i;
201 
202 	spin_lock(&ctx->napi_lock);
203 	hash_for_each(ctx->napi_ht, i, e, node) {
204 		hash_del_rcu(&e->node);
205 		kfree_rcu(e, rcu);
206 	}
207 	spin_unlock(&ctx->napi_lock);
208 }
209 
210 /*
211  * io_napi_register() - Register napi with io-uring
212  * @ctx: pointer to io-uring context structure
213  * @arg: pointer to io_uring_napi structure
214  *
215  * Register napi in the io-uring context.
216  */
217 int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
218 {
219 	const struct io_uring_napi curr = {
220 		.busy_poll_to 	  = ctx->napi_busy_poll_to,
221 		.prefer_busy_poll = ctx->napi_prefer_busy_poll
222 	};
223 	struct io_uring_napi napi;
224 
225 	if (copy_from_user(&napi, arg, sizeof(napi)))
226 		return -EFAULT;
227 	if (napi.pad[0] || napi.pad[1] || napi.pad[2] || napi.resv)
228 		return -EINVAL;
229 
230 	if (copy_to_user(arg, &curr, sizeof(curr)))
231 		return -EFAULT;
232 
233 	WRITE_ONCE(ctx->napi_busy_poll_to, napi.busy_poll_to);
234 	WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi.prefer_busy_poll);
235 	WRITE_ONCE(ctx->napi_enabled, true);
236 	return 0;
237 }
238 
239 /*
240  * io_napi_unregister() - Unregister napi with io-uring
241  * @ctx: pointer to io-uring context structure
242  * @arg: pointer to io_uring_napi structure
243  *
244  * Unregister napi. If arg has been specified copy the busy poll timeout and
245  * prefer busy poll setting to the passed in structure.
246  */
247 int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
248 {
249 	const struct io_uring_napi curr = {
250 		.busy_poll_to 	  = ctx->napi_busy_poll_to,
251 		.prefer_busy_poll = ctx->napi_prefer_busy_poll
252 	};
253 
254 	if (arg && copy_to_user(arg, &curr, sizeof(curr)))
255 		return -EFAULT;
256 
257 	WRITE_ONCE(ctx->napi_busy_poll_to, 0);
258 	WRITE_ONCE(ctx->napi_prefer_busy_poll, false);
259 	WRITE_ONCE(ctx->napi_enabled, false);
260 	return 0;
261 }
262 
263 /*
264  * __io_napi_adjust_timeout() - adjust busy loop timeout
265  * @ctx: pointer to io-uring context structure
266  * @iowq: pointer to io wait queue
267  * @ts: pointer to timespec or NULL
268  *
269  * Adjust the busy loop timeout according to timespec and busy poll timeout.
270  * If the specified NAPI timeout is bigger than the wait timeout, then adjust
271  * the NAPI timeout accordingly.
272  */
273 void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq,
274 			      struct timespec64 *ts)
275 {
276 	unsigned int poll_to = READ_ONCE(ctx->napi_busy_poll_to);
277 
278 	if (ts) {
279 		struct timespec64 poll_to_ts;
280 
281 		poll_to_ts = ns_to_timespec64(1000 * (s64)poll_to);
282 		if (timespec64_compare(ts, &poll_to_ts) < 0) {
283 			s64 poll_to_ns = timespec64_to_ns(ts);
284 			if (poll_to_ns > 0) {
285 				u64 val = poll_to_ns + 999;
286 				do_div(val, (s64) 1000);
287 				poll_to = val;
288 			}
289 		}
290 	}
291 
292 	iowq->napi_busy_poll_to = poll_to;
293 }
294 
295 /*
296  * __io_napi_busy_loop() - execute busy poll loop
297  * @ctx: pointer to io-uring context structure
298  * @iowq: pointer to io wait queue
299  *
300  * Execute the busy poll loop and merge the spliced off list.
301  */
302 void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq)
303 {
304 	iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll);
305 
306 	if (!(ctx->flags & IORING_SETUP_SQPOLL) && ctx->napi_enabled)
307 		io_napi_blocking_busy_loop(ctx, iowq);
308 }
309 
310 /*
311  * io_napi_sqpoll_busy_poll() - busy poll loop for sqpoll
312  * @ctx: pointer to io-uring context structure
313  *
314  * Splice of the napi list and execute the napi busy poll loop.
315  */
316 int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx)
317 {
318 	LIST_HEAD(napi_list);
319 	bool is_stale = false;
320 
321 	if (!READ_ONCE(ctx->napi_busy_poll_to))
322 		return 0;
323 	if (list_empty_careful(&ctx->napi_list))
324 		return 0;
325 
326 	rcu_read_lock();
327 	is_stale = __io_napi_do_busy_loop(ctx, NULL);
328 	rcu_read_unlock();
329 
330 	io_napi_remove_stale(ctx, is_stale);
331 	return 1;
332 }
333 
334 #endif
335