xref: /linux/io_uring/napi.c (revision a48395f22b8c8687ceb77ae3014a0eabcd4bf688)
// SPDX-License-Identifier: GPL-2.0

#include "io_uring.h"
#include "napi.h"

#ifdef CONFIG_NET_RX_BUSY_POLL

/* Timeout after which an unrefreshed (stale) entry is removed. */
#define NAPI_TIMEOUT		(60 * SEC_CONVERSION)

struct io_napi_entry {
	unsigned int		napi_id;
	struct list_head	list;

	unsigned long		timeout;
	struct hlist_node	node;

	struct rcu_head		rcu;
};

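/*
 * Look up @napi_id in one hash bucket under RCU. A hit also refreshes the
 * entry's stale timeout.
 */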
static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list,
					       unsigned int napi_id)
{
	struct io_napi_entry *e;

	hlist_for_each_entry_rcu(e, hash_list, node) {
		if (e->napi_id != napi_id)
			continue;
		e->timeout = jiffies + NAPI_TIMEOUT;
		return e;
	}

	return NULL;
}

static inline ktime_t net_to_ktime(unsigned long t)
{
	/*
	 * busy_loop_current_time() approximates microseconds as ns >> 10;
	 * reverse that shift to get back to a ktime in nanoseconds.
	 */
	return ns_to_ktime(t << 10);
}

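/*
 * __io_napi_add() - Add napi id to the busy poll list
 * @ctx: pointer to io-uring context structure
 * @sock: socket to be added
 *
 * Add the napi id of the socket to the napi busy poll list and hash table.
 */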
void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock)
{
	struct hlist_head *hash_list;
	unsigned int napi_id;
	struct sock *sk;
	struct io_napi_entry *e;

	sk = sock->sk;
	if (!sk)
		return;

	napi_id = READ_ONCE(sk->sk_napi_id);

	/* Reject ids below MIN_NAPI_ID: they are not valid NAPI ids. */
	if (napi_id < MIN_NAPI_ID)
		return;

	hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];

	rcu_read_lock();
	e = io_napi_hash_find(hash_list, napi_id);
	if (e) {
		e->timeout = jiffies + NAPI_TIMEOUT;
		rcu_read_unlock();
		return;
	}
	rcu_read_unlock();

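	/*
	 * A GFP_NOWAIT allocation can fail under memory pressure; in that
	 * case the id is simply not tracked and busy polling skips it.
	 */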
	e = kmalloc(sizeof(*e), GFP_NOWAIT);
	if (!e)
		return;

	e->napi_id = napi_id;
	e->timeout = jiffies + NAPI_TIMEOUT;

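	/*
	 * Recheck under the lock: another task may have added the same id
	 * since the RCU lookup above.
	 */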
	spin_lock(&ctx->napi_lock);
	if (unlikely(io_napi_hash_find(hash_list, napi_id))) {
		spin_unlock(&ctx->napi_lock);
		kfree(e);
		return;
	}

	hlist_add_tail_rcu(&e->node, hash_list);
	list_add_tail(&e->list, &ctx->napi_list);
	spin_unlock(&ctx->napi_lock);
}

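/* Walk the hash table and reap entries whose timeout has expired. */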
static void __io_napi_remove_stale(struct io_ring_ctx *ctx)
{
	struct io_napi_entry *e;
	unsigned int i;

	spin_lock(&ctx->napi_lock);
	hash_for_each(ctx->napi_ht, i, e, node) {
		if (time_after(jiffies, e->timeout)) {
			list_del(&e->list);
			hash_del_rcu(&e->node);
			kfree_rcu(e, rcu);
		}
	}
	spin_unlock(&ctx->napi_lock);
}

static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale)
{
	if (is_stale)
		__io_napi_remove_stale(ctx);
}

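/*
 * Loop-end helper: the busy loop should stop once the poll window @bp has
 * elapsed. A zero window always reports expiry.
 */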
static inline bool io_napi_busy_loop_timeout(ktime_t start_time,
					     ktime_t bp)
{
	if (bp) {
		ktime_t end_time = ktime_add(start_time, bp);
		ktime_t now = net_to_ktime(busy_loop_current_time());

		return ktime_after(now, end_time);
	}

	return true;
}

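/*
 * Stop busy polling when a signal is pending, when the waiter would be
 * woken or the ring has pending work, or when the poll window has expired.
 */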
static bool io_napi_busy_loop_should_end(void *data,
					 unsigned long start_time)
{
	struct io_wait_queue *iowq = data;

	if (signal_pending(current))
		return true;
	if (io_should_wake(iowq) || io_has_work(iowq->ctx))
		return true;
	if (io_napi_busy_loop_timeout(net_to_ktime(start_time),
				      iowq->napi_busy_poll_dt))
		return true;

	return false;
}

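/*
 * Busy poll every napi id on the list once. The io_uring loop-end check is
 * used only when @loop_end_arg is provided. Returns true if any entry has
 * gone stale so the caller can reap it.
 */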
static bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx,
				   void *loop_end_arg)
{
	struct io_napi_entry *e;
	bool (*loop_end)(void *, unsigned long) = NULL;
	bool is_stale = false;

	if (loop_end_arg)
		loop_end = io_napi_busy_loop_should_end;

	list_for_each_entry_rcu(e, &ctx->napi_list, list) {
		napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg,
				   ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET);

		if (time_after(jiffies, e->timeout))
			is_stale = true;
	}

	return is_stale;
}

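/*
 * Run the napi busy loop before the task sleeps waiting for completions.
 * A single-entry list is polled once with the io_uring loop-end check;
 * otherwise the whole list is polled repeatedly until the loop-end
 * condition triggers.
 */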
static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx,
				       struct io_wait_queue *iowq)
{
	unsigned long start_time = busy_loop_current_time();
	void *loop_end_arg = NULL;
	bool is_stale = false;

	/*
	 * A single-entry list uses the io_uring loop-end check function and
	 * the busy loop is executed only once.
	 */
	if (list_is_singular(&ctx->napi_list))
		loop_end_arg = iowq;

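	/*
	 * With multiple entries, keep polling the whole list until the
	 * loop-end condition reports that polling should stop.
	 */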
	rcu_read_lock();
	do {
		is_stale = __io_napi_do_busy_loop(ctx, loop_end_arg);
	} while (!io_napi_busy_loop_should_end(iowq, start_time) && !loop_end_arg);
	rcu_read_unlock();

	io_napi_remove_stale(ctx, is_stale);
}

/*
 * io_napi_init() - Init napi settings
 * @ctx: pointer to io-uring context structure
 *
 * Init napi settings in the io-uring context.
 */
void io_napi_init(struct io_ring_ctx *ctx)
{
	u64 sys_dt = READ_ONCE(sysctl_net_busy_poll) * NSEC_PER_USEC;

	INIT_LIST_HEAD(&ctx->napi_list);
	spin_lock_init(&ctx->napi_lock);
	ctx->napi_prefer_busy_poll = false;
	ctx->napi_busy_poll_dt = ns_to_ktime(sys_dt);
}

/*
 * io_napi_free() - Deallocate napi
 * @ctx: pointer to io-uring context structure
 *
 * Free the napi list and the hash table in the io-uring context.
 */
void io_napi_free(struct io_ring_ctx *ctx)
{
	struct io_napi_entry *e;
	LIST_HEAD(napi_list);
	unsigned int i;

	spin_lock(&ctx->napi_lock);
	hash_for_each(ctx->napi_ht, i, e, node) {
		hash_del_rcu(&e->node);
		kfree_rcu(e, rcu);
	}
	spin_unlock(&ctx->napi_lock);
}

/*
 * io_register_napi() - Register napi with io-uring
 * @ctx: pointer to io-uring context structure
 * @arg: pointer to io_uring_napi structure
 *
 * Register napi in the io-uring context. The previous settings are copied
 * back to the passed in structure before the new values take effect.
 */
int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
{
	const struct io_uring_napi curr = {
		.busy_poll_to	  = ktime_to_us(ctx->napi_busy_poll_dt),
		.prefer_busy_poll = ctx->napi_prefer_busy_poll
	};
	struct io_uring_napi napi;

	if (ctx->flags & IORING_SETUP_IOPOLL)
		return -EINVAL;
	if (copy_from_user(&napi, arg, sizeof(napi)))
		return -EFAULT;
	if (napi.pad[0] || napi.pad[1] || napi.pad[2] || napi.resv)
		return -EINVAL;

	if (copy_to_user(arg, &curr, sizeof(curr)))
		return -EFAULT;

	WRITE_ONCE(ctx->napi_busy_poll_dt, napi.busy_poll_to * NSEC_PER_USEC);
	WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi.prefer_busy_poll);
	WRITE_ONCE(ctx->napi_enabled, true);
	return 0;
}

/*
 * io_unregister_napi() - Unregister napi with io-uring
 * @ctx: pointer to io-uring context structure
 * @arg: pointer to io_uring_napi structure
 *
 * Unregister napi. If arg has been specified, copy the busy poll timeout and
 * prefer busy poll setting to the passed in structure.
 */
int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
{
	const struct io_uring_napi curr = {
		.busy_poll_to	  = ktime_to_us(ctx->napi_busy_poll_dt),
		.prefer_busy_poll = ctx->napi_prefer_busy_poll
	};

	if (arg && copy_to_user(arg, &curr, sizeof(curr)))
		return -EFAULT;

	WRITE_ONCE(ctx->napi_busy_poll_dt, 0);
	WRITE_ONCE(ctx->napi_prefer_busy_poll, false);
	WRITE_ONCE(ctx->napi_enabled, false);
	return 0;
}

/*
 * __io_napi_adjust_timeout() - adjust busy loop timeout
 * @ctx: pointer to io-uring context structure
 * @iowq: pointer to io wait queue
 * @to_wait: wait timeout as ktime, or 0 if the wait is unbounded
 *
 * Adjust the busy loop timeout according to the wait timeout and the busy
 * poll timeout. If the configured NAPI timeout is bigger than the wait
 * timeout, clamp it to the wait timeout.
 */
void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq,
			      ktime_t to_wait)
{
	ktime_t poll_dt = READ_ONCE(ctx->napi_busy_poll_dt);

	if (to_wait)
		poll_dt = min(poll_dt, to_wait);

	iowq->napi_busy_poll_dt = poll_dt;
}

/*
 * __io_napi_busy_loop() - execute busy poll loop
 * @ctx: pointer to io-uring context structure
 * @iowq: pointer to io wait queue
 *
 * Execute the busy poll loop if napi is enabled and the ring is not set up
 * for SQPOLL.
 */
void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq)
{
	iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll);

	if (!(ctx->flags & IORING_SETUP_SQPOLL) && ctx->napi_enabled)
		io_napi_blocking_busy_loop(ctx, iowq);
}

/*
 * io_napi_sqpoll_busy_poll() - busy poll loop for sqpoll
 * @ctx: pointer to io-uring context structure
 *
 * Execute the napi busy poll loop on behalf of the sqpoll thread. Returns 1
 * if the list was polled, 0 otherwise.
 */
int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx)
{
	LIST_HEAD(napi_list);
	bool is_stale = false;

	if (!READ_ONCE(ctx->napi_busy_poll_dt))
		return 0;
	if (list_empty_careful(&ctx->napi_list))
		return 0;

	rcu_read_lock();
	is_stale = __io_napi_do_busy_loop(ctx, NULL);
	rcu_read_unlock();

	io_napi_remove_stale(ctx, is_stale);
	return 1;
}

#endif