xref: /linux/io_uring/napi.c (revision e04e2b760ddbe3d7b283a05898c3a029085cd8cd)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "io_uring.h"
4 #include "napi.h"
5 
6 #ifdef CONFIG_NET_RX_BUSY_POLL
7 
/*
 * Timeout for cleanout of stale entries.
 *
 * Entries are stamped with "jiffies + NAPI_TIMEOUT" and later compared
 * against jiffies with time_after(), so this value is consumed as a number
 * of jiffies.
 * NOTE(review): confirm SEC_CONVERSION really is a jiffies-per-second factor;
 * 60 * HZ would be the conventional spelling of a 60 second timeout.
 */
#define NAPI_TIMEOUT		(60 * SEC_CONVERSION)
10 
/*
 * One tracked NAPI instance. Every entry is linked into both a bucket of
 * ctx->napi_ht (via ->node) and ctx->napi_list (via ->list), and is released
 * with kfree_rcu() through ->rcu.
 */
struct io_napi_entry {
	unsigned int		napi_id;	/* lookup key */
	struct list_head	list;		/* link in ctx->napi_list */

	unsigned long		timeout;	/* jiffies value after which the entry is stale */
	struct hlist_node	node;		/* link in a ctx->napi_ht bucket */

	struct rcu_head		rcu;		/* handed to kfree_rcu() */
};
20 
21 static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list,
22 					       unsigned int napi_id)
23 {
24 	struct io_napi_entry *e;
25 
26 	hlist_for_each_entry_rcu(e, hash_list, node) {
27 		if (e->napi_id != napi_id)
28 			continue;
29 		return e;
30 	}
31 
32 	return NULL;
33 }
34 
35 static inline ktime_t net_to_ktime(unsigned long t)
36 {
37 	/* napi approximating usecs, reverse busy_loop_current_time */
38 	return ns_to_ktime(t << 10);
39 }
40 
41 void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock)
42 {
43 	struct hlist_head *hash_list;
44 	unsigned int napi_id;
45 	struct sock *sk;
46 	struct io_napi_entry *e;
47 
48 	sk = sock->sk;
49 	if (!sk)
50 		return;
51 
52 	napi_id = READ_ONCE(sk->sk_napi_id);
53 
54 	/* Non-NAPI IDs can be rejected. */
55 	if (napi_id < MIN_NAPI_ID)
56 		return;
57 
58 	hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];
59 
60 	rcu_read_lock();
61 	e = io_napi_hash_find(hash_list, napi_id);
62 	if (e) {
63 		e->timeout = jiffies + NAPI_TIMEOUT;
64 		rcu_read_unlock();
65 		return;
66 	}
67 	rcu_read_unlock();
68 
69 	e = kmalloc(sizeof(*e), GFP_NOWAIT);
70 	if (!e)
71 		return;
72 
73 	e->napi_id = napi_id;
74 	e->timeout = jiffies + NAPI_TIMEOUT;
75 
76 	spin_lock(&ctx->napi_lock);
77 	if (unlikely(io_napi_hash_find(hash_list, napi_id))) {
78 		spin_unlock(&ctx->napi_lock);
79 		kfree(e);
80 		return;
81 	}
82 
83 	hlist_add_tail_rcu(&e->node, hash_list);
84 	list_add_tail(&e->list, &ctx->napi_list);
85 	spin_unlock(&ctx->napi_lock);
86 }
87 
88 static void __io_napi_remove_stale(struct io_ring_ctx *ctx)
89 {
90 	struct io_napi_entry *e;
91 	unsigned int i;
92 
93 	spin_lock(&ctx->napi_lock);
94 	hash_for_each(ctx->napi_ht, i, e, node) {
95 		if (time_after(jiffies, e->timeout)) {
96 			list_del(&e->list);
97 			hash_del_rcu(&e->node);
98 			kfree_rcu(e, rcu);
99 		}
100 	}
101 	spin_unlock(&ctx->napi_lock);
102 }
103 
104 static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale)
105 {
106 	if (is_stale)
107 		__io_napi_remove_stale(ctx);
108 }
109 
110 static inline bool io_napi_busy_loop_timeout(ktime_t start_time,
111 					     ktime_t bp)
112 {
113 	if (bp) {
114 		ktime_t end_time = ktime_add(start_time, bp);
115 		ktime_t now = net_to_ktime(busy_loop_current_time());
116 
117 		return ktime_after(now, end_time);
118 	}
119 
120 	return true;
121 }
122 
123 static bool io_napi_busy_loop_should_end(void *data,
124 					 unsigned long start_time)
125 {
126 	struct io_wait_queue *iowq = data;
127 
128 	if (signal_pending(current))
129 		return true;
130 	if (io_should_wake(iowq) || io_has_work(iowq->ctx))
131 		return true;
132 	if (io_napi_busy_loop_timeout(net_to_ktime(start_time),
133 				      iowq->napi_busy_poll_dt))
134 		return true;
135 
136 	return false;
137 }
138 
139 static bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx,
140 				   void *loop_end_arg)
141 {
142 	struct io_napi_entry *e;
143 	bool (*loop_end)(void *, unsigned long) = NULL;
144 	bool is_stale = false;
145 
146 	if (loop_end_arg)
147 		loop_end = io_napi_busy_loop_should_end;
148 
149 	list_for_each_entry_rcu(e, &ctx->napi_list, list) {
150 		napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg,
151 				   ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET);
152 
153 		if (time_after(jiffies, e->timeout))
154 			is_stale = true;
155 	}
156 
157 	return is_stale;
158 }
159 
160 static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx,
161 				       struct io_wait_queue *iowq)
162 {
163 	unsigned long start_time = busy_loop_current_time();
164 	void *loop_end_arg = NULL;
165 	bool is_stale = false;
166 
167 	/* Singular lists use a different napi loop end check function and are
168 	 * only executed once.
169 	 */
170 	if (list_is_singular(&ctx->napi_list))
171 		loop_end_arg = iowq;
172 
173 	rcu_read_lock();
174 	do {
175 		is_stale = __io_napi_do_busy_loop(ctx, loop_end_arg);
176 	} while (!io_napi_busy_loop_should_end(iowq, start_time) && !loop_end_arg);
177 	rcu_read_unlock();
178 
179 	io_napi_remove_stale(ctx, is_stale);
180 }
181 
182 /*
183  * io_napi_init() - Init napi settings
184  * @ctx: pointer to io-uring context structure
185  *
186  * Init napi settings in the io-uring context.
187  */
188 void io_napi_init(struct io_ring_ctx *ctx)
189 {
190 	u64 sys_dt = READ_ONCE(sysctl_net_busy_poll) * NSEC_PER_USEC;
191 
192 	INIT_LIST_HEAD(&ctx->napi_list);
193 	spin_lock_init(&ctx->napi_lock);
194 	ctx->napi_prefer_busy_poll = false;
195 	ctx->napi_busy_poll_dt = ns_to_ktime(sys_dt);
196 }
197 
198 /*
199  * io_napi_free() - Deallocate napi
200  * @ctx: pointer to io-uring context structure
201  *
202  * Free the napi list and the hash table in the io-uring context.
203  */
204 void io_napi_free(struct io_ring_ctx *ctx)
205 {
206 	struct io_napi_entry *e;
207 	unsigned int i;
208 
209 	spin_lock(&ctx->napi_lock);
210 	hash_for_each(ctx->napi_ht, i, e, node) {
211 		hash_del_rcu(&e->node);
212 		kfree_rcu(e, rcu);
213 	}
214 	spin_unlock(&ctx->napi_lock);
215 }
216 
217 /*
218  * io_napi_register() - Register napi with io-uring
219  * @ctx: pointer to io-uring context structure
220  * @arg: pointer to io_uring_napi structure
221  *
222  * Register napi in the io-uring context.
223  */
224 int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
225 {
226 	const struct io_uring_napi curr = {
227 		.busy_poll_to 	  = ktime_to_us(ctx->napi_busy_poll_dt),
228 		.prefer_busy_poll = ctx->napi_prefer_busy_poll
229 	};
230 	struct io_uring_napi napi;
231 
232 	if (ctx->flags & IORING_SETUP_IOPOLL)
233 		return -EINVAL;
234 	if (copy_from_user(&napi, arg, sizeof(napi)))
235 		return -EFAULT;
236 	if (napi.pad[0] || napi.pad[1] || napi.pad[2] || napi.resv)
237 		return -EINVAL;
238 
239 	if (copy_to_user(arg, &curr, sizeof(curr)))
240 		return -EFAULT;
241 
242 	WRITE_ONCE(ctx->napi_busy_poll_dt, napi.busy_poll_to * NSEC_PER_USEC);
243 	WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi.prefer_busy_poll);
244 	WRITE_ONCE(ctx->napi_enabled, true);
245 	return 0;
246 }
247 
248 /*
249  * io_napi_unregister() - Unregister napi with io-uring
250  * @ctx: pointer to io-uring context structure
251  * @arg: pointer to io_uring_napi structure
252  *
253  * Unregister napi. If arg has been specified copy the busy poll timeout and
254  * prefer busy poll setting to the passed in structure.
255  */
256 int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
257 {
258 	const struct io_uring_napi curr = {
259 		.busy_poll_to 	  = ktime_to_us(ctx->napi_busy_poll_dt),
260 		.prefer_busy_poll = ctx->napi_prefer_busy_poll
261 	};
262 
263 	if (arg && copy_to_user(arg, &curr, sizeof(curr)))
264 		return -EFAULT;
265 
266 	WRITE_ONCE(ctx->napi_busy_poll_dt, 0);
267 	WRITE_ONCE(ctx->napi_prefer_busy_poll, false);
268 	WRITE_ONCE(ctx->napi_enabled, false);
269 	return 0;
270 }
271 
272 /*
273  * __io_napi_adjust_timeout() - adjust busy loop timeout
274  * @ctx: pointer to io-uring context structure
275  * @iowq: pointer to io wait queue
276  * @ts: pointer to timespec or NULL
277  *
278  * Adjust the busy loop timeout according to timespec and busy poll timeout.
279  * If the specified NAPI timeout is bigger than the wait timeout, then adjust
280  * the NAPI timeout accordingly.
281  */
282 void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq,
283 			      ktime_t to_wait)
284 {
285 	ktime_t poll_dt = READ_ONCE(ctx->napi_busy_poll_dt);
286 
287 	if (to_wait)
288 		poll_dt = min(poll_dt, to_wait);
289 
290 	iowq->napi_busy_poll_dt = poll_dt;
291 }
292 
293 /*
294  * __io_napi_busy_loop() - execute busy poll loop
295  * @ctx: pointer to io-uring context structure
296  * @iowq: pointer to io wait queue
297  *
298  * Execute the busy poll loop and merge the spliced off list.
299  */
300 void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq)
301 {
302 	iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll);
303 
304 	if (!(ctx->flags & IORING_SETUP_SQPOLL))
305 		io_napi_blocking_busy_loop(ctx, iowq);
306 }
307 
308 /*
309  * io_napi_sqpoll_busy_poll() - busy poll loop for sqpoll
310  * @ctx: pointer to io-uring context structure
311  *
312  * Splice of the napi list and execute the napi busy poll loop.
313  */
314 int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx)
315 {
316 	bool is_stale = false;
317 
318 	if (!READ_ONCE(ctx->napi_busy_poll_dt))
319 		return 0;
320 	if (list_empty_careful(&ctx->napi_list))
321 		return 0;
322 
323 	rcu_read_lock();
324 	is_stale = __io_napi_do_busy_loop(ctx, NULL);
325 	rcu_read_unlock();
326 
327 	io_napi_remove_stale(ctx, is_stale);
328 	return 1;
329 }
330 
331 #endif
332