// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "tctx.h"
#include "bpf_filter.h"

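/*
 * Set up the io-wq backend for @task on behalf of @ctx. The hash map used
 * to serialize hashed work is per-ctx and created lazily here under
 * uring_lock, so concurrent setup attempts share a single map.
 */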
static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
					struct task_struct *task)
{
	struct io_wq_hash *hash;
	struct io_wq_data data;
	unsigned int concurrency;

	mutex_lock(&ctx->uring_lock);
	hash = ctx->hash_map;
	if (!hash) {
		hash = kzalloc_obj(*hash);
		if (!hash) {
			mutex_unlock(&ctx->uring_lock);
			return ERR_PTR(-ENOMEM);
		}
		refcount_set(&hash->refs, 1);
		init_waitqueue_head(&hash->wait);
		ctx->hash_map = hash;
	}
	mutex_unlock(&ctx->uring_lock);

	data.hash = hash;
	data.task = task;

	/* Use QD, or 4 * CPUS, whichever is smaller */
	concurrency = min(ctx->sq_entries, 4 * num_online_cpus());

	return io_wq_create(concurrency, &data);
}

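/*
 * Final teardown of the per-task io_uring state, called once the task can
 * no longer issue requests. By now the ctx xarray should be empty and the
 * io-wq already torn down; warn if either still holds anything.
 */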
void __io_uring_free(struct task_struct *tsk)
{
	struct io_uring_task *tctx = tsk->io_uring;
	struct io_tctx_node *node;
	unsigned long index;

	/*
	 * Fault injection forcing allocation errors in the xa_store() path
	 * can lead to xa_empty() returning false, even though no actual
	 * node is stored in the xarray. Until that gets sorted out, attempt
	 * an iteration here and warn if any entries are found.
	 */
	if (tctx) {
		xa_for_each(&tctx->xa, index, node) {
			WARN_ON_ONCE(1);
			break;
		}
		WARN_ON_ONCE(tctx->io_wq);
		WARN_ON_ONCE(tctx->cached_refs);

		percpu_counter_destroy(&tctx->inflight);
		kfree(tctx);
		tsk->io_uring = NULL;
	}
	if (tsk->io_uring_restrict) {
		io_put_bpf_filters(tsk->io_uring_restrict);
		kfree(tsk->io_uring_restrict);
		tsk->io_uring_restrict = NULL;
	}
}

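/*
 * Allocate and initialize the io_uring_task that hangs off the task_struct:
 * the inflight counter, the per-task io-wq, and the task_work context used
 * to run completions.
 */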
__cold int io_uring_alloc_task_context(struct task_struct *task,
				       struct io_ring_ctx *ctx)
{
	struct io_uring_task *tctx;
	int ret;

	tctx = kzalloc_obj(*tctx);
	if (unlikely(!tctx))
		return -ENOMEM;

	ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
	if (unlikely(ret)) {
		kfree(tctx);
		return ret;
	}

	tctx->io_wq = io_init_wq_offload(ctx, task);
	if (IS_ERR(tctx->io_wq)) {
		ret = PTR_ERR(tctx->io_wq);
		percpu_counter_destroy(&tctx->inflight);
		kfree(tctx);
		return ret;
	}

	tctx->task = task;
	xa_init(&tctx->xa);
	init_waitqueue_head(&tctx->wait);
	atomic_set(&tctx->in_cancel, 0);
	atomic_set(&tctx->inflight_tracked, 0);
	task->io_uring = tctx;
	init_llist_head(&tctx->task_list);
	init_task_work(&tctx->task_work, tctx_task_work);
	return 0;
}

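/*
 * Establish the task <-> ctx mapping: allocate the task context on first
 * use (applying any io-wq worker limits registered earlier), then link an
 * io_tctx_node into both the task's xarray and the ctx's tctx_list.
 */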
int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
{
	struct io_uring_task *tctx = current->io_uring;
	struct io_tctx_node *node;
	int ret;

	if (unlikely(!tctx)) {
		ret = io_uring_alloc_task_context(current, ctx);
		if (unlikely(ret))
			return ret;

		tctx = current->io_uring;
		if (ctx->iowq_limits_set) {
			unsigned int limits[2] = { ctx->iowq_limits[0],
						   ctx->iowq_limits[1], };

			ret = io_wq_max_workers(tctx->io_wq, limits);
			if (ret)
				return ret;
		}
	}

	/*
	 * Re-activate io-wq keepalive on any new io_uring usage. The wq may
	 * have been marked for idle-exit when the task temporarily had no
	 * active io_uring instances.
	 */
	if (tctx->io_wq)
		io_wq_set_exit_on_idle(tctx->io_wq, false);
	if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
		node = kmalloc_obj(*node);
		if (!node)
			return -ENOMEM;
		node->ctx = ctx;
		node->task = current;

		ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
					node, GFP_KERNEL));
		if (ret) {
			kfree(node);
			return ret;
		}

		mutex_lock(&ctx->tctx_lock);
		list_add(&node->ctx_node, &ctx->tctx_list);
		mutex_unlock(&ctx->tctx_lock);
	}
	return 0;
}

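/*
 * Submission-path variant of the above: with IORING_SETUP_SINGLE_ISSUER,
 * only the original submitter task may attach; any other task gets
 * -EEXIST. On success the ctx is cached as the task's last-used ring.
 */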
int __io_uring_add_tctx_node_from_submit(struct io_ring_ctx *ctx)
{
	int ret;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER
	    && ctx->submitter_task != current)
		return -EEXIST;

	ret = __io_uring_add_tctx_node(ctx);
	if (ret)
		return ret;

	current->io_uring->last = ctx;
	return 0;
}

/*
 * Remove this io_uring_file -> task mapping.
 */
__cold void io_uring_del_tctx_node(unsigned long index)
{
	struct io_uring_task *tctx = current->io_uring;
	struct io_tctx_node *node;

	if (!tctx)
		return;
	node = xa_erase(&tctx->xa, index);
	if (!node)
		return;

	WARN_ON_ONCE(current != node->task);
	WARN_ON_ONCE(list_empty(&node->ctx_node));

	mutex_lock(&node->ctx->tctx_lock);
	list_del(&node->ctx_node);
	mutex_unlock(&node->ctx->tctx_lock);

	if (tctx->last == node->ctx)
		tctx->last = NULL;
	kfree(node);

	if (xa_empty(&tctx->xa) && tctx->io_wq)
		io_wq_set_exit_on_idle(tctx->io_wq, true);
}

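/*
 * Drop every ctx mapping this task still holds, then shut down its io-wq.
 */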
__cold void io_uring_clean_tctx(struct io_uring_task *tctx)
{
	struct io_wq *wq = tctx->io_wq;
	struct io_tctx_node *node;
	unsigned long index;

	xa_for_each(&tctx->xa, index, node) {
		io_uring_del_tctx_node(index);
		cond_resched();
	}
	if (wq) {
		/*
		 * Must be after io_uring_del_tctx_node() (removes nodes under
		 * tctx_lock) to avoid race with io_uring_try_cancel_iowq().
		 */
		io_wq_put_and_exit(wq);
		tctx->io_wq = NULL;
	}
}

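/*
 * Drop all ring files this task registered via io_ringfd_register().
 * Callers must guarantee that current->io_uring is non-NULL.
 */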
void io_uring_unreg_ringfd(void)
{
	struct io_uring_task *tctx = current->io_uring;
	int i;

	for (i = 0; i < IO_RINGFD_REG_MAX; i++) {
		if (tctx->registered_rings[i]) {
			fput(tctx->registered_rings[i]);
			tctx->registered_rings[i] = NULL;
		}
	}
}

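/*
 * Store @file in the first free registered-ring slot in [start, end),
 * returning the slot index used, or -EBUSY if the whole range is taken.
 */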
int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file,
				int start, int end)
{
	int offset, idx;

	for (offset = start; offset < end; offset++) {
		idx = array_index_nospec(offset, IO_RINGFD_REG_MAX);
		if (tctx->registered_rings[idx])
			continue;

		tctx->registered_rings[idx] = file;
		return idx;
	}
	return -EBUSY;
}

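/*
 * Resolve @fd to a file, verify it is really an io_uring file, and park it
 * in a registered-ring slot; the file reference is dropped on failure.
 */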
static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
				     int start, int end)
{
	struct file *file;
	int offset;

	file = fget(fd);
	if (!file) {
		return -EBADF;
	} else if (!io_is_uring_fops(file)) {
		fput(file);
		return -EOPNOTSUPP;
	}
	offset = io_ring_add_registered_file(tctx, file, start, end);
	if (offset < 0)
		fput(file);
	return offset;
}

/*
 * Register a ring fd to avoid fdget/fdput for each io_uring_enter()
 * invocation. User passes in an array of struct io_uring_rsrc_update
 * with ->data set to the ring_fd, and ->offset given for the desired
 * index. If no index is desired, application may set ->offset == -1U
 * and we'll find an available index. Returns number of entries
 * successfully processed, or < 0 on error if none were processed.
 */
int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,
		       unsigned nr_args)
{
	struct io_uring_rsrc_update __user *arg = __arg;
	struct io_uring_rsrc_update reg;
	struct io_uring_task *tctx;
	int ret, i;

	if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
		return -EINVAL;

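	/*
	 * Drop uring_lock across the node setup: __io_uring_add_tctx_node()
	 * may call into io_init_wq_offload(), which takes uring_lock itself.
	 */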
	mutex_unlock(&ctx->uring_lock);
	ret = __io_uring_add_tctx_node(ctx);
	mutex_lock(&ctx->uring_lock);
	if (ret)
		return ret;

	tctx = current->io_uring;
	for (i = 0; i < nr_args; i++) {
		int start, end;

		if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
			ret = -EFAULT;
			break;
		}

		if (reg.resv) {
			ret = -EINVAL;
			break;
		}

		if (reg.offset == -1U) {
			start = 0;
			end = IO_RINGFD_REG_MAX;
		} else {
			if (reg.offset >= IO_RINGFD_REG_MAX) {
				ret = -EINVAL;
				break;
			}
			start = reg.offset;
			end = start + 1;
		}

		ret = io_ring_add_registered_fd(tctx, reg.data, start, end);
		if (ret < 0)
			break;

		reg.offset = ret;
		if (copy_to_user(&arg[i], &reg, sizeof(reg))) {
			fput(tctx->registered_rings[reg.offset]);
			tctx->registered_rings[reg.offset] = NULL;
			ret = -EFAULT;
			break;
		}
	}

	return i ? i : ret;
}

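/*
 * Userspace sketch (illustrative only, not part of this file): registering
 * one ring fd at a kernel-chosen slot via the io_uring_register(2) syscall
 * opcode IORING_REGISTER_RING_FDS, assuming "ring_fd" is an io_uring fd:
 *
 *	struct io_uring_rsrc_update upd = {
 *		.offset	= -1U,		// let the kernel pick a free slot
 *		.data	= ring_fd,	// the ring fd to register
 *	};
 *
 *	// returns 1 on success; upd.offset then holds the chosen index,
 *	// usable with IORING_ENTER_REGISTERED_RING in io_uring_enter()
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_RING_FDS,
 *		&upd, 1);
 */
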
int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
			 unsigned nr_args)
{
	struct io_uring_rsrc_update __user *arg = __arg;
	struct io_uring_task *tctx = current->io_uring;
	struct io_uring_rsrc_update reg;
	int ret = 0, i;

	if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
		return -EINVAL;
	if (!tctx)
		return 0;

	for (i = 0; i < nr_args; i++) {
		if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
			ret = -EFAULT;
			break;
		}
		if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) {
			ret = -EINVAL;
			break;
		}

		reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX);
		if (tctx->registered_rings[reg.offset]) {
			fput(tctx->registered_rings[reg.offset]);
			tctx->registered_rings[reg.offset] = NULL;
		}
	}

	return i ? i : ret;
}

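/*
 * On fork, give the child its own copy of the parent's io_uring
 * restrictions, so parent and child never share (or double-free) one
 * io_restriction.
 */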
int __io_uring_fork(struct task_struct *tsk)
{
	struct io_restriction *res, *src = tsk->io_uring_restrict;

	/* Don't leave it dangling on error */
	tsk->io_uring_restrict = NULL;

	res = kzalloc_obj(*res, GFP_KERNEL_ACCOUNT);
	if (!res)
		return -ENOMEM;

	tsk->io_uring_restrict = res;
	io_restriction_clone(res, src);
	return 0;
}
