xref: /linux/io_uring/napi.c (revision 663a917475530feff868a4f2bda286ea4171f420)
// SPDX-License-Identifier: GPL-2.0

#include "io_uring.h"
#include "napi.h"

#ifdef CONFIG_NET_RX_BUSY_POLL

/* Timeout for cleanup of stale entries. */
#define NAPI_TIMEOUT		(60 * SEC_CONVERSION)

struct io_napi_entry {
	unsigned int		napi_id;
	struct list_head	list;

	unsigned long		timeout;
	struct hlist_node	node;

	struct rcu_head		rcu;
};

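/* Look up a tracked NAPI id in one hash bucket; returns NULL if not tracked. */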
static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list,
					       unsigned int napi_id)
{
	struct io_napi_entry *e;

	hlist_for_each_entry_rcu(e, hash_list, node) {
		if (e->napi_id != napi_id)
			continue;
		return e;
	}

	return NULL;
}

static inline ktime_t net_to_ktime(unsigned long t)
{
	/*
	 * busy_loop_current_time() approximates usecs as ns >> 10; reverse
	 * that here.
	 */
	return ns_to_ktime(t << 10);
}

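/*
 * Add a NAPI id to the tracking table; if the id is already tracked, only
 * its timeout is refreshed.
 */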
int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id)
{
	struct hlist_head *hash_list;
	struct io_napi_entry *e;

	/* Reject non-NAPI IDs. */
	if (napi_id < MIN_NAPI_ID)
		return -EINVAL;

	hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];

	scoped_guard(rcu) {
		e = io_napi_hash_find(hash_list, napi_id);
		if (e) {
			WRITE_ONCE(e->timeout, jiffies + NAPI_TIMEOUT);
			return -EEXIST;
		}
	}

	e = kmalloc(sizeof(*e), GFP_NOWAIT);
	if (!e)
		return -ENOMEM;

	e->napi_id = napi_id;
	e->timeout = jiffies + NAPI_TIMEOUT;

	/*
	 * guard(spinlock) is not used here so that the lock can be dropped
	 * manually before calling kfree().
	 */
	spin_lock(&ctx->napi_lock);
	if (unlikely(io_napi_hash_find(hash_list, napi_id))) {
		spin_unlock(&ctx->napi_lock);
		kfree(e);
		return -EEXIST;
	}

	hlist_add_tail_rcu(&e->node, hash_list);
	list_add_tail_rcu(&e->list, &ctx->napi_list);
	spin_unlock(&ctx->napi_lock);
	return 0;
}

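/* Remove a NAPI id from the tracking table; used by static tracking. */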
static int __io_napi_del_id(struct io_ring_ctx *ctx, unsigned int napi_id)
{
	struct hlist_head *hash_list;
	struct io_napi_entry *e;

	/* Reject non-NAPI IDs. */
	if (napi_id < MIN_NAPI_ID)
		return -EINVAL;

	hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];
	guard(spinlock)(&ctx->napi_lock);
	e = io_napi_hash_find(hash_list, napi_id);
	if (!e)
		return -ENOENT;

	list_del_rcu(&e->list);
	hash_del_rcu(&e->node);
	kfree_rcu(e, rcu);
	return 0;
}

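/* Drop every tracked entry whose timeout has expired. */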
static void __io_napi_remove_stale(struct io_ring_ctx *ctx)
{
	struct io_napi_entry *e;

	guard(spinlock)(&ctx->napi_lock);
	/*
	 * list_for_each_entry_safe() is not required as long as:
	 * 1. list_del_rcu() does not reset the deleted node next pointer
	 * 2. kfree_rcu() delays the memory freeing until the next quiescent
	 *    state
	 */
	list_for_each_entry(e, &ctx->napi_list, list) {
		if (time_after(jiffies, READ_ONCE(e->timeout))) {
			list_del_rcu(&e->list);
			hash_del_rcu(&e->node);
			kfree_rcu(e, rcu);
		}
	}
}

static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale)
{
	if (is_stale)
		__io_napi_remove_stale(ctx);
}

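/*
 * Check whether the busy poll window since @start_time has elapsed; a zero
 * duration counts as already expired.
 */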
static inline bool io_napi_busy_loop_timeout(ktime_t start_time,
					     ktime_t bp)
{
	if (bp) {
		ktime_t end_time = ktime_add(start_time, bp);
		ktime_t now = net_to_ktime(busy_loop_current_time());

		return ktime_after(now, end_time);
	}

	return true;
}

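/*
 * Busy polling should stop when a signal is pending, when completions are
 * ready to be reaped, or once the busy poll window has expired.
 */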
static bool io_napi_busy_loop_should_end(void *data,
					 unsigned long start_time)
{
	struct io_wait_queue *iowq = data;

	if (signal_pending(current))
		return true;
	if (io_should_wake(iowq) || io_has_work(iowq->ctx))
		return true;
	if (io_napi_busy_loop_timeout(net_to_ktime(start_time),
				      iowq->napi_busy_poll_dt))
		return true;

	return false;
}

/*
 * Static tracking never reports stale entries: ids stay registered until
 * they are explicitly removed.
 */
static bool static_tracking_do_busy_loop(struct io_ring_ctx *ctx,
					 bool (*loop_end)(void *, unsigned long),
					 void *loop_end_arg)
{
	struct io_napi_entry *e;

	list_for_each_entry_rcu(e, &ctx->napi_list, list)
		napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg,
				   ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET);
	return false;
}

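/*
 * Busy poll every tracked NAPI id and report whether any entry has outlived
 * its timeout so the caller can prune the stale ones.
 */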
static bool
dynamic_tracking_do_busy_loop(struct io_ring_ctx *ctx,
			      bool (*loop_end)(void *, unsigned long),
			      void *loop_end_arg)
{
	struct io_napi_entry *e;
	bool is_stale = false;

	list_for_each_entry_rcu(e, &ctx->napi_list, list) {
		napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg,
				   ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET);

		if (time_after(jiffies, READ_ONCE(e->timeout)))
			is_stale = true;
	}

	return is_stale;
}

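/* Pick the busy poll strategy based on the configured tracking mode. */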
static inline bool
__io_napi_do_busy_loop(struct io_ring_ctx *ctx,
		       bool (*loop_end)(void *, unsigned long),
		       void *loop_end_arg)
{
	if (READ_ONCE(ctx->napi_track_mode) == IO_URING_NAPI_TRACKING_STATIC)
		return static_tracking_do_busy_loop(ctx, loop_end, loop_end_arg);
	return dynamic_tracking_do_busy_loop(ctx, loop_end, loop_end_arg);
}

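/*
 * Busy poll the tracked NAPI instances until io_napi_busy_loop_should_end()
 * says to stop, then reap any entries that went stale while polling.
 */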
static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx,
				       struct io_wait_queue *iowq)
{
	unsigned long start_time = busy_loop_current_time();
	bool (*loop_end)(void *, unsigned long) = NULL;
	void *loop_end_arg = NULL;
	bool is_stale = false;

	/* Singular lists use a different napi loop end check function and are
	 * only executed once.
	 */
	if (list_is_singular(&ctx->napi_list)) {
		loop_end = io_napi_busy_loop_should_end;
		loop_end_arg = iowq;
	}

	scoped_guard(rcu) {
		do {
			is_stale = __io_napi_do_busy_loop(ctx, loop_end,
							  loop_end_arg);
		} while (!io_napi_busy_loop_should_end(iowq, start_time) &&
			 !loop_end_arg);
	}

	io_napi_remove_stale(ctx, is_stale);
}

/*
 * io_napi_init() - Init napi settings
 * @ctx: pointer to io-uring context structure
 *
 * Init napi settings in the io-uring context.
 */
void io_napi_init(struct io_ring_ctx *ctx)
{
	u64 sys_dt = READ_ONCE(sysctl_net_busy_poll) * NSEC_PER_USEC;

	INIT_LIST_HEAD(&ctx->napi_list);
	spin_lock_init(&ctx->napi_lock);
	ctx->napi_prefer_busy_poll = false;
	ctx->napi_busy_poll_dt = ns_to_ktime(sys_dt);
	ctx->napi_track_mode = IO_URING_NAPI_TRACKING_INACTIVE;
}

/*
 * io_napi_free() - Deallocate napi
 * @ctx: pointer to io-uring context structure
 *
 * Free the napi list and the hash table in the io-uring context.
 */
void io_napi_free(struct io_ring_ctx *ctx)
{
	struct io_napi_entry *e;

	guard(spinlock)(&ctx->napi_lock);
	list_for_each_entry(e, &ctx->napi_list, list) {
		hash_del_rcu(&e->node);
		kfree_rcu(e, rcu);
	}
	INIT_LIST_HEAD_RCU(&ctx->napi_list);
}

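/*
 * Handle IO_URING_NAPI_REGISTER_OP: validate the requested tracking mode,
 * drop all currently tracked ids and apply the new busy poll settings.
 */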
static int io_napi_register_napi(struct io_ring_ctx *ctx,
				 struct io_uring_napi *napi)
{
	switch (napi->op_param) {
	case IO_URING_NAPI_TRACKING_DYNAMIC:
	case IO_URING_NAPI_TRACKING_STATIC:
		break;
	default:
		return -EINVAL;
	}
	/* clean the napi list for new settings */
	io_napi_free(ctx);
	WRITE_ONCE(ctx->napi_track_mode, napi->op_param);
	WRITE_ONCE(ctx->napi_busy_poll_dt, napi->busy_poll_to * NSEC_PER_USEC);
	WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi->prefer_busy_poll);
	return 0;
}

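/*
 * A minimal userspace sketch of how this entry point is reached, assuming
 * the IORING_REGISTER_NAPI registration opcode and a raw io_uring_register(2)
 * call (liburing provides wrappers for the same thing):
 *
 *	struct io_uring_napi napi = {
 *		.busy_poll_to	  = 100,
 *		.prefer_busy_poll = 1,
 *		.opcode		  = IO_URING_NAPI_REGISTER_OP,
 *		.op_param	  = IO_URING_NAPI_TRACKING_DYNAMIC,
 *	};
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_NAPI, &napi, 1);
 */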
/*
 * io_register_napi() - Register napi with io-uring
 * @ctx: pointer to io-uring context structure
 * @arg: pointer to io_uring_napi structure
 *
 * Register napi in the io-uring context.
 */
int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
{
	const struct io_uring_napi curr = {
		.busy_poll_to 	  = ktime_to_us(ctx->napi_busy_poll_dt),
		.prefer_busy_poll = ctx->napi_prefer_busy_poll,
		.op_param	  = ctx->napi_track_mode
	};
	struct io_uring_napi napi;

	if (ctx->flags & IORING_SETUP_IOPOLL)
		return -EINVAL;
	if (copy_from_user(&napi, arg, sizeof(napi)))
		return -EFAULT;
	if (napi.pad[0] || napi.pad[1] || napi.resv)
		return -EINVAL;

	if (copy_to_user(arg, &curr, sizeof(curr)))
		return -EFAULT;

	switch (napi.opcode) {
	case IO_URING_NAPI_REGISTER_OP:
		return io_napi_register_napi(ctx, &napi);
	case IO_URING_NAPI_STATIC_ADD_ID:
		if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC)
			return -EINVAL;
		return __io_napi_add_id(ctx, napi.op_param);
	case IO_URING_NAPI_STATIC_DEL_ID:
		if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC)
			return -EINVAL;
		return __io_napi_del_id(ctx, napi.op_param);
	default:
		return -EINVAL;
	}
}

/*
 * io_unregister_napi() - Unregister napi with io-uring
 * @ctx: pointer to io-uring context structure
 * @arg: pointer to io_uring_napi structure
 *
 * Unregister napi. If arg has been specified, copy the busy poll timeout and
 * prefer busy poll setting to the passed-in structure.
 */
int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
{
	const struct io_uring_napi curr = {
		.busy_poll_to 	  = ktime_to_us(ctx->napi_busy_poll_dt),
		.prefer_busy_poll = ctx->napi_prefer_busy_poll
	};

	if (arg && copy_to_user(arg, &curr, sizeof(curr)))
		return -EFAULT;

	WRITE_ONCE(ctx->napi_busy_poll_dt, 0);
	WRITE_ONCE(ctx->napi_prefer_busy_poll, false);
	WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE);
	return 0;
}

/*
 * __io_napi_busy_loop() - execute busy poll loop
 * @ctx: pointer to io-uring context structure
 * @iowq: pointer to io wait queue
 *
 * Set up the busy poll parameters from the context, cap the poll duration to
 * the remaining wait timeout and execute the blocking busy poll loop.
 */
void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq)
{
	if (ctx->flags & IORING_SETUP_SQPOLL)
		return;

	iowq->napi_busy_poll_dt = READ_ONCE(ctx->napi_busy_poll_dt);
	if (iowq->timeout != KTIME_MAX) {
		ktime_t dt = ktime_sub(iowq->timeout, io_get_time(ctx));

		iowq->napi_busy_poll_dt = min_t(u64, iowq->napi_busy_poll_dt, dt);
	}

	iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll);
	io_napi_blocking_busy_loop(ctx, iowq);
}

/*
 * io_napi_sqpoll_busy_poll() - busy poll loop for sqpoll
 * @ctx: pointer to io-uring context structure
 *
 * Execute the napi busy poll loop for the sqpoll thread and remove any
 * entries that have gone stale.
 */
int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx)
{
	bool is_stale = false;

	if (!READ_ONCE(ctx->napi_busy_poll_dt))
		return 0;
	if (list_empty_careful(&ctx->napi_list))
		return 0;

	scoped_guard(rcu) {
		is_stale = __io_napi_do_busy_loop(ctx, NULL, NULL);
	}

	io_napi_remove_stale(ctx, is_stale);
	return 1;
}

#endif