1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "io_uring.h"
4 #include "napi.h"
5
6 #ifdef CONFIG_NET_RX_BUSY_POLL
7
/*
 * Lifetime of a tracked NAPI entry. The value is added to jiffies whenever an
 * entry is created or refreshed; entries past that deadline are treated as
 * stale and cleaned out.
 */
#define NAPI_TIMEOUT		(60 * SEC_CONVERSION)
10
11 struct io_napi_entry {
12 unsigned int napi_id;
13 struct list_head list;
14
15 unsigned long timeout;
16 struct hlist_node node;
17
18 struct rcu_head rcu;
19 };
20
io_napi_hash_find(struct hlist_head * hash_list,unsigned int napi_id)21 static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list,
22 unsigned int napi_id)
23 {
24 struct io_napi_entry *e;
25
26 hlist_for_each_entry_rcu(e, hash_list, node) {
27 if (e->napi_id != napi_id)
28 continue;
29 return e;
30 }
31
32 return NULL;
33 }
34
net_to_ktime(unsigned long t)35 static inline ktime_t net_to_ktime(unsigned long t)
36 {
37 /* napi approximating usecs, reverse busy_loop_current_time */
38 return ns_to_ktime(t << 10);
39 }
40
__io_napi_add_id(struct io_ring_ctx * ctx,unsigned int napi_id,unsigned int mode)41 int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id,
42 unsigned int mode)
43 {
44 struct hlist_head *hash_list;
45 struct io_napi_entry *e;
46
47 /* Non-NAPI IDs can be rejected. */
48 if (!napi_id_valid(napi_id))
49 return -EINVAL;
50
51 hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];
52
53 scoped_guard(rcu) {
54 e = io_napi_hash_find(hash_list, napi_id);
55 if (e) {
56 WRITE_ONCE(e->timeout, jiffies + NAPI_TIMEOUT);
57 return -EEXIST;
58 }
59 }
60
61 e = kmalloc(sizeof(*e), GFP_NOWAIT);
62 if (!e)
63 return -ENOMEM;
64
65 e->napi_id = napi_id;
66 e->timeout = jiffies + NAPI_TIMEOUT;
67
68 /*
69 * guard(spinlock) is not used to manually unlock it before calling
70 * kfree()
71 */
72 spin_lock(&ctx->napi_lock);
73 if (unlikely(READ_ONCE(ctx->napi_track_mode) != mode)) {
74 spin_unlock(&ctx->napi_lock);
75 kfree(e);
76 return -EINVAL;
77 }
78 if (unlikely(io_napi_hash_find(hash_list, napi_id))) {
79 spin_unlock(&ctx->napi_lock);
80 kfree(e);
81 return -EEXIST;
82 }
83
84 hlist_add_tail_rcu(&e->node, hash_list);
85 list_add_tail_rcu(&e->list, &ctx->napi_list);
86 spin_unlock(&ctx->napi_lock);
87 return 0;
88 }
89
__io_napi_del_id(struct io_ring_ctx * ctx,unsigned int napi_id)90 static int __io_napi_del_id(struct io_ring_ctx *ctx, unsigned int napi_id)
91 {
92 struct hlist_head *hash_list;
93 struct io_napi_entry *e;
94
95 /* Non-NAPI IDs can be rejected. */
96 if (!napi_id_valid(napi_id))
97 return -EINVAL;
98
99 hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];
100 guard(spinlock)(&ctx->napi_lock);
101 e = io_napi_hash_find(hash_list, napi_id);
102 if (!e)
103 return -ENOENT;
104
105 list_del_rcu(&e->list);
106 hash_del_rcu(&e->node);
107 kfree_rcu(e, rcu);
108 return 0;
109 }
110
__io_napi_remove_stale(struct io_ring_ctx * ctx)111 static void __io_napi_remove_stale(struct io_ring_ctx *ctx)
112 {
113 struct io_napi_entry *e;
114
115 guard(spinlock)(&ctx->napi_lock);
116 /*
117 * list_for_each_entry_safe() is not required as long as:
118 * 1. list_del_rcu() does not reset the deleted node next pointer
119 * 2. kfree_rcu() delays the memory freeing until the next quiescent
120 * state
121 */
122 list_for_each_entry(e, &ctx->napi_list, list) {
123 if (time_after(jiffies, READ_ONCE(e->timeout))) {
124 list_del_rcu(&e->list);
125 hash_del_rcu(&e->node);
126 kfree_rcu(e, rcu);
127 }
128 }
129 }
130
/* Run the stale-entry cleanup only when the caller saw a stale entry. */
static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale)
{
	if (!is_stale)
		return;

	__io_napi_remove_stale(ctx);
}
136
io_napi_busy_loop_timeout(ktime_t start_time,ktime_t bp)137 static inline bool io_napi_busy_loop_timeout(ktime_t start_time,
138 ktime_t bp)
139 {
140 if (bp) {
141 ktime_t end_time = ktime_add(start_time, bp);
142 ktime_t now = net_to_ktime(busy_loop_current_time());
143
144 return ktime_after(now, end_time);
145 }
146
147 return true;
148 }
149
io_napi_busy_loop_should_end(void * data,unsigned long start_time)150 static bool io_napi_busy_loop_should_end(void *data,
151 unsigned long start_time)
152 {
153 struct io_wait_queue *iowq = data;
154
155 if (signal_pending(current))
156 return true;
157 if (io_should_wake(iowq) || io_has_work(iowq->ctx))
158 return true;
159 if (io_napi_busy_loop_timeout(net_to_ktime(start_time),
160 iowq->napi_busy_poll_dt))
161 return true;
162
163 return false;
164 }
165
166 /*
167 * never report stale entries
168 */
static_tracking_do_busy_loop(struct io_ring_ctx * ctx,bool (* loop_end)(void *,unsigned long),void * loop_end_arg)169 static bool static_tracking_do_busy_loop(struct io_ring_ctx *ctx,
170 bool (*loop_end)(void *, unsigned long),
171 void *loop_end_arg)
172 {
173 struct io_napi_entry *e;
174
175 list_for_each_entry_rcu(e, &ctx->napi_list, list)
176 napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg,
177 ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET);
178 return false;
179 }
180
181 static bool
dynamic_tracking_do_busy_loop(struct io_ring_ctx * ctx,bool (* loop_end)(void *,unsigned long),void * loop_end_arg)182 dynamic_tracking_do_busy_loop(struct io_ring_ctx *ctx,
183 bool (*loop_end)(void *, unsigned long),
184 void *loop_end_arg)
185 {
186 struct io_napi_entry *e;
187 bool is_stale = false;
188
189 list_for_each_entry_rcu(e, &ctx->napi_list, list) {
190 napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg,
191 ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET);
192
193 if (time_after(jiffies, READ_ONCE(e->timeout)))
194 is_stale = true;
195 }
196
197 return is_stale;
198 }
199
200 static inline bool
__io_napi_do_busy_loop(struct io_ring_ctx * ctx,bool (* loop_end)(void *,unsigned long),void * loop_end_arg)201 __io_napi_do_busy_loop(struct io_ring_ctx *ctx,
202 bool (*loop_end)(void *, unsigned long),
203 void *loop_end_arg)
204 {
205 switch (READ_ONCE(ctx->napi_track_mode)) {
206 case IO_URING_NAPI_TRACKING_STATIC:
207 return static_tracking_do_busy_loop(ctx, loop_end, loop_end_arg);
208 case IO_URING_NAPI_TRACKING_DYNAMIC:
209 return dynamic_tracking_do_busy_loop(ctx, loop_end, loop_end_arg);
210 default:
211 return false;
212 }
213 }
214
io_napi_blocking_busy_loop(struct io_ring_ctx * ctx,struct io_wait_queue * iowq)215 static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx,
216 struct io_wait_queue *iowq)
217 {
218 unsigned long start_time = busy_loop_current_time();
219 bool (*loop_end)(void *, unsigned long) = NULL;
220 void *loop_end_arg = NULL;
221 bool is_stale = false;
222
223 /* Singular lists use a different napi loop end check function and are
224 * only executed once.
225 */
226 if (list_is_singular(&ctx->napi_list)) {
227 loop_end = io_napi_busy_loop_should_end;
228 loop_end_arg = iowq;
229 }
230
231 scoped_guard(rcu) {
232 do {
233 is_stale = __io_napi_do_busy_loop(ctx, loop_end,
234 loop_end_arg);
235 } while (!io_napi_busy_loop_should_end(iowq, start_time) &&
236 !loop_end_arg);
237 }
238
239 io_napi_remove_stale(ctx, is_stale);
240 }
241
242 /*
243 * io_napi_init() - Init napi settings
244 * @ctx: pointer to io-uring context structure
245 *
246 * Init napi settings in the io-uring context.
247 */
io_napi_init(struct io_ring_ctx * ctx)248 void io_napi_init(struct io_ring_ctx *ctx)
249 {
250 u64 sys_dt = READ_ONCE(sysctl_net_busy_poll) * NSEC_PER_USEC;
251
252 INIT_LIST_HEAD(&ctx->napi_list);
253 spin_lock_init(&ctx->napi_lock);
254 ctx->napi_prefer_busy_poll = false;
255 ctx->napi_busy_poll_dt = ns_to_ktime(sys_dt);
256 ctx->napi_track_mode = IO_URING_NAPI_TRACKING_INACTIVE;
257 }
258
259 /*
260 * io_napi_free() - Deallocate napi
261 * @ctx: pointer to io-uring context structure
262 *
263 * Free the napi list and the hash table in the io-uring context.
264 */
io_napi_free(struct io_ring_ctx * ctx)265 void io_napi_free(struct io_ring_ctx *ctx)
266 {
267 struct io_napi_entry *e;
268
269 guard(spinlock)(&ctx->napi_lock);
270 list_for_each_entry(e, &ctx->napi_list, list) {
271 hash_del_rcu(&e->node);
272 kfree_rcu(e, rcu);
273 }
274 INIT_LIST_HEAD_RCU(&ctx->napi_list);
275 }
276
io_napi_register_napi(struct io_ring_ctx * ctx,struct io_uring_napi * napi)277 static int io_napi_register_napi(struct io_ring_ctx *ctx,
278 struct io_uring_napi *napi)
279 {
280 switch (napi->op_param) {
281 case IO_URING_NAPI_TRACKING_DYNAMIC:
282 case IO_URING_NAPI_TRACKING_STATIC:
283 break;
284 default:
285 return -EINVAL;
286 }
287 WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE);
288 io_napi_free(ctx);
289 /* cap NAPI at 10 msec of spin time */
290 napi->busy_poll_to = min(10000, napi->busy_poll_to);
291 WRITE_ONCE(ctx->napi_busy_poll_dt, napi->busy_poll_to * NSEC_PER_USEC);
292 WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi->prefer_busy_poll);
293 WRITE_ONCE(ctx->napi_track_mode, napi->op_param);
294 return 0;
295 }
296
297 /*
298 * io_napi_register() - Register napi with io-uring
299 * @ctx: pointer to io-uring context structure
300 * @arg: pointer to io_uring_napi structure
301 *
302 * Register napi in the io-uring context.
303 */
io_register_napi(struct io_ring_ctx * ctx,void __user * arg)304 int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
305 {
306 const struct io_uring_napi curr = {
307 .busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt),
308 .prefer_busy_poll = ctx->napi_prefer_busy_poll,
309 .op_param = ctx->napi_track_mode
310 };
311 struct io_uring_napi napi;
312
313 if (ctx->flags & IORING_SETUP_IOPOLL)
314 return -EINVAL;
315 if (copy_from_user(&napi, arg, sizeof(napi)))
316 return -EFAULT;
317 if (napi.pad[0] || napi.pad[1] || napi.resv)
318 return -EINVAL;
319
320 if (copy_to_user(arg, &curr, sizeof(curr)))
321 return -EFAULT;
322
323 switch (napi.opcode) {
324 case IO_URING_NAPI_REGISTER_OP:
325 return io_napi_register_napi(ctx, &napi);
326 case IO_URING_NAPI_STATIC_ADD_ID:
327 if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC)
328 return -EINVAL;
329 return __io_napi_add_id(ctx, napi.op_param,
330 IO_URING_NAPI_TRACKING_STATIC);
331 case IO_URING_NAPI_STATIC_DEL_ID:
332 if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC)
333 return -EINVAL;
334 return __io_napi_del_id(ctx, napi.op_param);
335 default:
336 return -EINVAL;
337 }
338 }
339
340 /*
341 * io_napi_unregister() - Unregister napi with io-uring
342 * @ctx: pointer to io-uring context structure
343 * @arg: pointer to io_uring_napi structure
344 *
345 * Unregister napi. If arg has been specified copy the busy poll timeout and
346 * prefer busy poll setting to the passed in structure.
347 */
io_unregister_napi(struct io_ring_ctx * ctx,void __user * arg)348 int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
349 {
350 const struct io_uring_napi curr = {
351 .busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt),
352 .prefer_busy_poll = ctx->napi_prefer_busy_poll
353 };
354
355 if (arg && copy_to_user(arg, &curr, sizeof(curr)))
356 return -EFAULT;
357
358 WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE);
359 WRITE_ONCE(ctx->napi_busy_poll_dt, 0);
360 WRITE_ONCE(ctx->napi_prefer_busy_poll, false);
361 io_napi_free(ctx);
362 return 0;
363 }
364
365 /*
366 * __io_napi_busy_loop() - execute busy poll loop
367 * @ctx: pointer to io-uring context structure
368 * @iowq: pointer to io wait queue
369 *
370 * Execute the busy poll loop and merge the spliced off list.
371 */
__io_napi_busy_loop(struct io_ring_ctx * ctx,struct io_wait_queue * iowq)372 void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq)
373 {
374 if (ctx->flags & IORING_SETUP_SQPOLL)
375 return;
376
377 iowq->napi_busy_poll_dt = READ_ONCE(ctx->napi_busy_poll_dt);
378 if (iowq->timeout != KTIME_MAX) {
379 ktime_t dt = ktime_sub(iowq->timeout, io_get_time(ctx));
380
381 iowq->napi_busy_poll_dt = min_t(u64, iowq->napi_busy_poll_dt, dt);
382 }
383
384 iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll);
385 io_napi_blocking_busy_loop(ctx, iowq);
386 }
387
388 /*
389 * io_napi_sqpoll_busy_poll() - busy poll loop for sqpoll
390 * @ctx: pointer to io-uring context structure
391 *
392 * Splice of the napi list and execute the napi busy poll loop.
393 */
io_napi_sqpoll_busy_poll(struct io_ring_ctx * ctx)394 int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx)
395 {
396 bool is_stale = false;
397
398 if (!READ_ONCE(ctx->napi_busy_poll_dt))
399 return 0;
400 if (list_empty_careful(&ctx->napi_list))
401 return 0;
402
403 scoped_guard(rcu) {
404 is_stale = __io_napi_do_busy_loop(ctx, NULL, NULL);
405 }
406
407 io_napi_remove_stale(ctx, is_stale);
408 return 1;
409 }
410
411 #endif
412