xref: /linux/io_uring/register.c (revision f879306834818ebd1722a4372079610cdd466fec)
// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

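/*
 * IORING_REGISTER_PROBE: report which opcodes this kernel supports. The
 * user-supplied probe structure must be zeroed; it is filled with one entry
 * per opcode, flagged IO_URING_OP_SUPPORTED where applicable, and copied
 * back to userspace.
 */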
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	size = struct_size(p, ops, nr_args);
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (io_uring_op_supported(i))
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

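/*
 * Drop the credentials registered under @id. Returns 0 on success, or
 * -EINVAL if no personality with that id exists.
 */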
int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}

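/*
 * IORING_REGISTER_PERSONALITY: stash a reference to the caller's current
 * credentials and return the allocated id (cyclic, capped at USHRT_MAX),
 * which SQEs can later reference via sqe->personality.
 */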
static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

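/*
 * IORING_REGISTER_RESTRICTIONS: install a one-shot set of restrictions on
 * the register opcodes, SQE opcodes and SQE flags this ring may use. Only
 * allowed while the ring is still disabled (IORING_SETUP_R_DISABLED); any
 * invalid entry wipes the partially applied state.
 */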
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = 0;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].register_op,
				  ctx->restrictions.register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			ret = -EINVAL;
			goto out;
		}
	}

out:
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;

	kfree(res);
	return ret;
}

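/*
 * IORING_REGISTER_ENABLE_RINGS: bring a ring created with
 * IORING_SETUP_R_DISABLED online. This latches the submitter task for
 * SINGLE_ISSUER rings, arms any registered restrictions and wakes a
 * waiting SQPOLL thread.
 */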
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if the ring was polled
		 * before submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

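/*
 * Apply @new_mask (or reset the affinity when it is NULL) either to the
 * caller's io-wq or, for SQPOLL rings, to the SQPOLL worker; the SQPOLL
 * path temporarily drops uring_lock around the call.
 */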
static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

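/*
 * IORING_REGISTER_IOWQ_AFF: copy a CPU mask of @len bytes from userspace,
 * handling the compat bitmap layout, and apply it via
 * __io_register_iowq_aff().
 */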
static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

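/* IORING_UNREGISTER_IOWQ_AFF: clear the registered affinity (NULL mask). */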
static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

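/*
 * IORING_REGISTER_IOWQ_MAX_WORKERS: cap the number of bounded and unbounded
 * io-wq workers for this ring. The values handed back by io_wq_max_workers()
 * are copied to userspace, and non-zero limits are propagated to every task
 * attached to the ring (or only to the SQPOLL task for SQPOLL rings).
 */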
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}

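/*
 * Dispatch a single register opcode with ctx->uring_lock held. Only the
 * submitter task may get here once one is set, and restricted rings are
 * checked against the registered register-op bitmap first.
 */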
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

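/*
 * io_uring_register(2): resolve the ring either from a normal fd or, with
 * IORING_REGISTER_USE_REGISTERED_RING, from the task's registered-ring
 * table, then run the requested opcode under uring_lock.
 */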
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (use_registered_ring) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return -EINVAL;
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (unlikely(!file))
			return -EBADF;
	} else {
		file = fget(fd);
		if (unlikely(!file))
			return -EBADF;
		ret = -EOPNOTSUPP;
		if (!io_is_uring_fops(file))
			goto out_fput;
	}

	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);
	mutex_unlock(&ctx->uring_lock);
	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
	if (!use_registered_ring)
		fput(file);
	return ret;
}
569