xref: /linux/io_uring/napi.c (revision 8be01e1280912a84f6bcf963ceed6c9f13ba1986)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "io_uring.h"
4 #include "napi.h"
5 
6 #ifdef CONFIG_NET_RX_BUSY_POLL
7 
/*
 * Timeout for cleanout of stale entries.
 *
 * The value is added to jiffies whenever an entry is created or refreshed;
 * an entry whose stored deadline has passed is treated as stale.
 * NOTE(review): presumably 60 seconds expressed in jiffies - confirm against
 * the definition of SEC_CONVERSION.
 */
#define NAPI_TIMEOUT		(60 * SEC_CONVERSION)
10 
/*
 * One tracked NAPI ID. An entry is linked both on ctx->napi_list and in a
 * bucket of ctx->napi_ht, and is released through kfree_rcu().
 */
struct io_napi_entry {
	/* NAPI ID this entry stands for; the lookup key in the hash bucket */
	unsigned int		napi_id;
	/* link on ctx->napi_list, walked by the busy poll loops */
	struct list_head	list;

	/* jiffies deadline; once passed, the entry is considered stale */
	unsigned long		timeout;
	/* link in the ctx->napi_ht bucket selected by napi_id */
	struct hlist_node	node;

	/* handed to kfree_rcu() when the entry is removed */
	struct rcu_head		rcu;
};
20 
io_napi_hash_find(struct hlist_head * hash_list,unsigned int napi_id)21 static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list,
22 					       unsigned int napi_id)
23 {
24 	struct io_napi_entry *e;
25 
26 	hlist_for_each_entry_rcu(e, hash_list, node) {
27 		if (e->napi_id != napi_id)
28 			continue;
29 		return e;
30 	}
31 
32 	return NULL;
33 }
34 
net_to_ktime(unsigned long t)35 static inline ktime_t net_to_ktime(unsigned long t)
36 {
37 	/* napi approximating usecs, reverse busy_loop_current_time */
38 	return ns_to_ktime(t << 10);
39 }
40 
__io_napi_add_id(struct io_ring_ctx * ctx,unsigned int napi_id,unsigned int mode)41 int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id,
42 		     unsigned int mode)
43 {
44 	struct hlist_head *hash_list;
45 	struct io_napi_entry *e;
46 
47 	/* Non-NAPI IDs can be rejected. */
48 	if (!napi_id_valid(napi_id))
49 		return -EINVAL;
50 
51 	hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];
52 
53 	scoped_guard(rcu) {
54 		e = io_napi_hash_find(hash_list, napi_id);
55 		if (e) {
56 			WRITE_ONCE(e->timeout, jiffies + NAPI_TIMEOUT);
57 			return -EEXIST;
58 		}
59 	}
60 
61 	e = kmalloc(sizeof(*e), GFP_NOWAIT);
62 	if (!e)
63 		return -ENOMEM;
64 
65 	e->napi_id = napi_id;
66 	e->timeout = jiffies + NAPI_TIMEOUT;
67 
68 	/*
69 	 * guard(spinlock) is not used to manually unlock it before calling
70 	 * kfree()
71 	 */
72 	spin_lock(&ctx->napi_lock);
73 	if (unlikely(READ_ONCE(ctx->napi_track_mode) != mode)) {
74 		spin_unlock(&ctx->napi_lock);
75 		kfree(e);
76 		return -EINVAL;
77 	}
78 	if (unlikely(io_napi_hash_find(hash_list, napi_id))) {
79 		spin_unlock(&ctx->napi_lock);
80 		kfree(e);
81 		return -EEXIST;
82 	}
83 
84 	hlist_add_tail_rcu(&e->node, hash_list);
85 	list_add_tail_rcu(&e->list, &ctx->napi_list);
86 	spin_unlock(&ctx->napi_lock);
87 	return 0;
88 }
89 
__io_napi_del_id(struct io_ring_ctx * ctx,unsigned int napi_id)90 static int __io_napi_del_id(struct io_ring_ctx *ctx, unsigned int napi_id)
91 {
92 	struct hlist_head *hash_list;
93 	struct io_napi_entry *e;
94 
95 	/* Non-NAPI IDs can be rejected. */
96 	if (!napi_id_valid(napi_id))
97 		return -EINVAL;
98 
99 	hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];
100 	guard(spinlock)(&ctx->napi_lock);
101 	e = io_napi_hash_find(hash_list, napi_id);
102 	if (!e)
103 		return -ENOENT;
104 
105 	list_del_rcu(&e->list);
106 	hash_del_rcu(&e->node);
107 	kfree_rcu(e, rcu);
108 	return 0;
109 }
110 
__io_napi_remove_stale(struct io_ring_ctx * ctx)111 static void __io_napi_remove_stale(struct io_ring_ctx *ctx)
112 {
113 	struct io_napi_entry *e;
114 
115 	guard(spinlock)(&ctx->napi_lock);
116 	/*
117 	 * list_for_each_entry_safe() is not required as long as:
118 	 * 1. list_del_rcu() does not reset the deleted node next pointer
119 	 * 2. kfree_rcu() delays the memory freeing until the next quiescent
120 	 *    state
121 	 */
122 	list_for_each_entry(e, &ctx->napi_list, list) {
123 		if (time_after(jiffies, READ_ONCE(e->timeout))) {
124 			list_del_rcu(&e->list);
125 			hash_del_rcu(&e->node);
126 			kfree_rcu(e, rcu);
127 		}
128 	}
129 }
130 
/*
 * Run the stale entry cleanup, but only if the caller saw a stale entry
 * (@is_stale); otherwise do nothing and leave napi_lock untouched.
 */
static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale)
{
	if (!is_stale)
		return;

	__io_napi_remove_stale(ctx);
}
136 
io_napi_busy_loop_timeout(ktime_t start_time,ktime_t bp)137 static inline bool io_napi_busy_loop_timeout(ktime_t start_time,
138 					     ktime_t bp)
139 {
140 	if (bp) {
141 		ktime_t end_time = ktime_add(start_time, bp);
142 		ktime_t now = net_to_ktime(busy_loop_current_time());
143 
144 		return ktime_after(now, end_time);
145 	}
146 
147 	return true;
148 }
149 
io_napi_busy_loop_should_end(void * data,unsigned long start_time)150 static bool io_napi_busy_loop_should_end(void *data,
151 					 unsigned long start_time)
152 {
153 	struct io_wait_queue *iowq = data;
154 
155 	if (signal_pending(current))
156 		return true;
157 	if (io_should_wake(iowq) || io_has_work(iowq->ctx))
158 		return true;
159 	if (io_napi_busy_loop_timeout(net_to_ktime(start_time),
160 				      iowq->napi_busy_poll_dt))
161 		return true;
162 
163 	return false;
164 }
165 
166 /*
167  * never report stale entries
168  */
static_tracking_do_busy_loop(struct io_ring_ctx * ctx,bool (* loop_end)(void *,unsigned long),void * loop_end_arg)169 static bool static_tracking_do_busy_loop(struct io_ring_ctx *ctx,
170 					 bool (*loop_end)(void *, unsigned long),
171 					 void *loop_end_arg)
172 {
173 	struct io_napi_entry *e;
174 
175 	list_for_each_entry_rcu(e, &ctx->napi_list, list)
176 		napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg,
177 				   ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET);
178 	return false;
179 }
180 
181 static bool
dynamic_tracking_do_busy_loop(struct io_ring_ctx * ctx,bool (* loop_end)(void *,unsigned long),void * loop_end_arg)182 dynamic_tracking_do_busy_loop(struct io_ring_ctx *ctx,
183 			      bool (*loop_end)(void *, unsigned long),
184 			      void *loop_end_arg)
185 {
186 	struct io_napi_entry *e;
187 	bool is_stale = false;
188 
189 	list_for_each_entry_rcu(e, &ctx->napi_list, list) {
190 		napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg,
191 				   ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET);
192 
193 		if (time_after(jiffies, READ_ONCE(e->timeout)))
194 			is_stale = true;
195 	}
196 
197 	return is_stale;
198 }
199 
200 static inline bool
__io_napi_do_busy_loop(struct io_ring_ctx * ctx,bool (* loop_end)(void *,unsigned long),void * loop_end_arg)201 __io_napi_do_busy_loop(struct io_ring_ctx *ctx,
202 		       bool (*loop_end)(void *, unsigned long),
203 		       void *loop_end_arg)
204 {
205 	switch (READ_ONCE(ctx->napi_track_mode)) {
206 	case IO_URING_NAPI_TRACKING_STATIC:
207 		return static_tracking_do_busy_loop(ctx, loop_end, loop_end_arg);
208 	case IO_URING_NAPI_TRACKING_DYNAMIC:
209 		return dynamic_tracking_do_busy_loop(ctx, loop_end, loop_end_arg);
210 	default:
211 		return false;
212 	}
213 }
214 
io_napi_blocking_busy_loop(struct io_ring_ctx * ctx,struct io_wait_queue * iowq)215 static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx,
216 				       struct io_wait_queue *iowq)
217 {
218 	unsigned long start_time = busy_loop_current_time();
219 	bool (*loop_end)(void *, unsigned long) = NULL;
220 	void *loop_end_arg = NULL;
221 	bool is_stale = false;
222 
223 	/* Singular lists use a different napi loop end check function and are
224 	 * only executed once.
225 	 */
226 	if (list_is_singular(&ctx->napi_list)) {
227 		loop_end = io_napi_busy_loop_should_end;
228 		loop_end_arg = iowq;
229 	}
230 
231 	scoped_guard(rcu) {
232 		do {
233 			is_stale = __io_napi_do_busy_loop(ctx, loop_end,
234 							  loop_end_arg);
235 		} while (!io_napi_busy_loop_should_end(iowq, start_time) &&
236 			 !loop_end_arg);
237 	}
238 
239 	io_napi_remove_stale(ctx, is_stale);
240 }
241 
242 /*
243  * io_napi_init() - Init napi settings
244  * @ctx: pointer to io-uring context structure
245  *
246  * Init napi settings in the io-uring context.
247  */
io_napi_init(struct io_ring_ctx * ctx)248 void io_napi_init(struct io_ring_ctx *ctx)
249 {
250 	u64 sys_dt = READ_ONCE(sysctl_net_busy_poll) * NSEC_PER_USEC;
251 
252 	INIT_LIST_HEAD(&ctx->napi_list);
253 	spin_lock_init(&ctx->napi_lock);
254 	ctx->napi_prefer_busy_poll = false;
255 	ctx->napi_busy_poll_dt = ns_to_ktime(sys_dt);
256 	ctx->napi_track_mode = IO_URING_NAPI_TRACKING_INACTIVE;
257 }
258 
259 /*
260  * io_napi_free() - Deallocate napi
261  * @ctx: pointer to io-uring context structure
262  *
263  * Free the napi list and the hash table in the io-uring context.
264  */
io_napi_free(struct io_ring_ctx * ctx)265 void io_napi_free(struct io_ring_ctx *ctx)
266 {
267 	struct io_napi_entry *e;
268 
269 	guard(spinlock)(&ctx->napi_lock);
270 	list_for_each_entry(e, &ctx->napi_list, list) {
271 		hash_del_rcu(&e->node);
272 		kfree_rcu(e, rcu);
273 	}
274 	INIT_LIST_HEAD_RCU(&ctx->napi_list);
275 }
276 
io_napi_register_napi(struct io_ring_ctx * ctx,struct io_uring_napi * napi)277 static int io_napi_register_napi(struct io_ring_ctx *ctx,
278 				 struct io_uring_napi *napi)
279 {
280 	switch (napi->op_param) {
281 	case IO_URING_NAPI_TRACKING_DYNAMIC:
282 	case IO_URING_NAPI_TRACKING_STATIC:
283 		break;
284 	default:
285 		return -EINVAL;
286 	}
287 	WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE);
288 	io_napi_free(ctx);
289 	/* cap NAPI at 10 msec of spin time */
290 	napi->busy_poll_to = min(10000, napi->busy_poll_to);
291 	WRITE_ONCE(ctx->napi_busy_poll_dt, napi->busy_poll_to * NSEC_PER_USEC);
292 	WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi->prefer_busy_poll);
293 	WRITE_ONCE(ctx->napi_track_mode, napi->op_param);
294 	return 0;
295 }
296 
297 /*
298  * io_napi_register() - Register napi with io-uring
299  * @ctx: pointer to io-uring context structure
300  * @arg: pointer to io_uring_napi structure
301  *
302  * Register napi in the io-uring context.
303  */
io_register_napi(struct io_ring_ctx * ctx,void __user * arg)304 int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
305 {
306 	const struct io_uring_napi curr = {
307 		.busy_poll_to 	  = ktime_to_us(ctx->napi_busy_poll_dt),
308 		.prefer_busy_poll = ctx->napi_prefer_busy_poll,
309 		.op_param	  = ctx->napi_track_mode
310 	};
311 	struct io_uring_napi napi;
312 
313 	if (ctx->flags & IORING_SETUP_IOPOLL)
314 		return -EINVAL;
315 	if (copy_from_user(&napi, arg, sizeof(napi)))
316 		return -EFAULT;
317 	if (napi.pad[0] || napi.pad[1] || napi.resv)
318 		return -EINVAL;
319 
320 	if (copy_to_user(arg, &curr, sizeof(curr)))
321 		return -EFAULT;
322 
323 	switch (napi.opcode) {
324 	case IO_URING_NAPI_REGISTER_OP:
325 		return io_napi_register_napi(ctx, &napi);
326 	case IO_URING_NAPI_STATIC_ADD_ID:
327 		if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC)
328 			return -EINVAL;
329 		return __io_napi_add_id(ctx, napi.op_param,
330 					IO_URING_NAPI_TRACKING_STATIC);
331 	case IO_URING_NAPI_STATIC_DEL_ID:
332 		if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC)
333 			return -EINVAL;
334 		return __io_napi_del_id(ctx, napi.op_param);
335 	default:
336 		return -EINVAL;
337 	}
338 }
339 
340 /*
341  * io_napi_unregister() - Unregister napi with io-uring
342  * @ctx: pointer to io-uring context structure
343  * @arg: pointer to io_uring_napi structure
344  *
345  * Unregister napi. If arg has been specified copy the busy poll timeout and
346  * prefer busy poll setting to the passed in structure.
347  */
io_unregister_napi(struct io_ring_ctx * ctx,void __user * arg)348 int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
349 {
350 	const struct io_uring_napi curr = {
351 		.busy_poll_to 	  = ktime_to_us(ctx->napi_busy_poll_dt),
352 		.prefer_busy_poll = ctx->napi_prefer_busy_poll
353 	};
354 
355 	if (arg && copy_to_user(arg, &curr, sizeof(curr)))
356 		return -EFAULT;
357 
358 	WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE);
359 	WRITE_ONCE(ctx->napi_busy_poll_dt, 0);
360 	WRITE_ONCE(ctx->napi_prefer_busy_poll, false);
361 	io_napi_free(ctx);
362 	return 0;
363 }
364 
365 /*
366  * __io_napi_busy_loop() - execute busy poll loop
367  * @ctx: pointer to io-uring context structure
368  * @iowq: pointer to io wait queue
369  *
370  * Execute the busy poll loop and merge the spliced off list.
371  */
__io_napi_busy_loop(struct io_ring_ctx * ctx,struct io_wait_queue * iowq)372 void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq)
373 {
374 	if (ctx->flags & IORING_SETUP_SQPOLL)
375 		return;
376 
377 	iowq->napi_busy_poll_dt = READ_ONCE(ctx->napi_busy_poll_dt);
378 	if (iowq->timeout != KTIME_MAX) {
379 		ktime_t dt = ktime_sub(iowq->timeout, io_get_time(ctx));
380 
381 		iowq->napi_busy_poll_dt = min_t(u64, iowq->napi_busy_poll_dt, dt);
382 	}
383 
384 	iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll);
385 	io_napi_blocking_busy_loop(ctx, iowq);
386 }
387 
388 /*
389  * io_napi_sqpoll_busy_poll() - busy poll loop for sqpoll
390  * @ctx: pointer to io-uring context structure
391  *
392  * Splice of the napi list and execute the napi busy poll loop.
393  */
io_napi_sqpoll_busy_poll(struct io_ring_ctx * ctx)394 int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx)
395 {
396 	bool is_stale = false;
397 
398 	if (!READ_ONCE(ctx->napi_busy_poll_dt))
399 		return 0;
400 	if (list_empty_careful(&ctx->napi_list))
401 		return 0;
402 
403 	scoped_guard(rcu) {
404 		is_stale = __io_napi_do_busy_loop(ctx, NULL, NULL);
405 	}
406 
407 	io_napi_remove_stale(ctx, is_stale);
408 	return 1;
409 }
410 
411 #endif
412