xref: /linux/io_uring/register.c (revision e6a901a00822659181c93c86d8bbc2a17779fddc)
// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

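/*
 * Register an eventfd for CQE notifications (IORING_REGISTER_EVENTFD /
 * IORING_REGISTER_EVENTFD_ASYNC). The descriptor is read from userspace and
 * resolved to an eventfd context, which is published via RCU; only one
 * eventfd may be registered per ring. The current CQ tail is snapshotted so
 * later signalling can be skipped when no new CQEs have been posted. With
 * eventfd_async set, only completions from async context signal the eventfd.
 */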
static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
			       unsigned int eventfd_async)
{
	struct io_ev_fd *ev_fd;
	__s32 __user *fds = arg;
	int fd;

	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					lockdep_is_held(&ctx->uring_lock));
	if (ev_fd)
		return -EBUSY;

	if (copy_from_user(&fd, fds, sizeof(*fds)))
		return -EFAULT;

	ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
	if (!ev_fd)
		return -ENOMEM;

	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
	if (IS_ERR(ev_fd->cq_ev_fd)) {
		int ret = PTR_ERR(ev_fd->cq_ev_fd);
		kfree(ev_fd);
		return ret;
	}

	spin_lock(&ctx->completion_lock);
	ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
	spin_unlock(&ctx->completion_lock);

	ev_fd->eventfd_async = eventfd_async;
	ctx->has_evfd = true;
	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
	atomic_set(&ev_fd->refs, 1);
	atomic_set(&ev_fd->ops, 0);
	return 0;
}

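/*
 * Drop the registered eventfd, if any. The ops bitmask ensures the final
 * call_rcu() free happens only once, even if a signalling operation is
 * still in flight on another CPU.
 */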
int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
	struct io_ev_fd *ev_fd;

	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					lockdep_is_held(&ctx->uring_lock));
	if (ev_fd) {
		ctx->has_evfd = false;
		rcu_assign_pointer(ctx->io_ev_fd, NULL);
		if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops))
			call_rcu(&ev_fd->rcu, io_eventfd_ops);
		return 0;
	}

	return -ENXIO;
}

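/*
 * IORING_REGISTER_PROBE: report which opcodes this kernel supports. The
 * user-supplied buffer must be zeroed on entry; it is filled with one entry
 * per opcode, flagged IO_URING_OP_SUPPORTED where an implementation exists.
 */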
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	size = struct_size(p, ops, nr_args);
	if (size == SIZE_MAX)
		return -EOVERFLOW;
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;
	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (!io_issue_defs[i].not_supported)
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

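/* Drop a previously registered personality (stashed credentials) by id. */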
int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}

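/*
 * IORING_REGISTER_PERSONALITY: take a reference to the current task's
 * credentials and return an id that SQEs can later reference through
 * sqe->personality.
 */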
static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

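/*
 * IORING_REGISTER_RESTRICTIONS: restrict which register opcodes, SQE opcodes
 * and SQE flags are allowed on this ring. Only valid while the ring was
 * created with IORING_SETUP_R_DISABLED and not yet enabled, and only a
 * single restrictions registration is accepted.
 */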
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = 0;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].register_op,
				  ctx->restrictions.register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			ret = -EINVAL;
			goto out;
		}
	}

out:
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;

	kfree(res);
	return ret;
}

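/*
 * IORING_REGISTER_ENABLE_RINGS: enable a ring that was created with
 * IORING_SETUP_R_DISABLED, arming any registered restrictions and waking a
 * waiting SQPOLL thread. For IORING_SETUP_SINGLE_ISSUER the enabling task
 * becomes the submitter if one was not already set.
 */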
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

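/*
 * Apply an io-wq CPU affinity mask, either to the current task's io-wq or,
 * for SQPOLL rings, to the SQPOLL thread's. uring_lock is dropped around the
 * SQPOLL path so the sqd->lock -> uring_lock ordering used elsewhere in this
 * file is preserved.
 */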
static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

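/*
 * IORING_REGISTER_IOWQ_AFF: copy the CPU affinity bitmap from userspace
 * (honouring the compat bitmap layout) and apply it. Masks larger than the
 * kernel's cpumask size are truncated.
 */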
static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

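/*
 * IORING_REGISTER_IOWQ_MAX_WORKERS: set per-ring limits on bounded and
 * unbounded io-wq workers and return the previous values to userspace. For
 * SQPOLL rings the limits apply to the SQPOLL task's io-wq; otherwise they
 * are also propagated to every task attached to the ring.
 */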
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		struct io_uring_task *tctx = node->task->io_uring;

		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
	}
	return ret;
}

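/*
 * Dispatch a single io_uring_register() opcode with ctx->uring_lock held.
 * For IORING_SETUP_SINGLE_ISSUER rings only the submitter task may register,
 * and when restrictions are active only opcodes allowed by
 * IORING_REGISTER_RESTRICTIONS get this far.
 */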
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

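/*
 * io_uring_register(2) entry point: resolve the ring file, either from a
 * normal fd or, with IORING_REGISTER_USE_REGISTERED_RING, from an index into
 * the task's registered ring array, then run the opcode under uring_lock.
 *
 * Rough userspace usage (a sketch via the raw syscall rather than liburing;
 * ring_fd and some_fd are placeholder names):
 *
 *	int fds[1] = { some_fd };
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_FILES,
 *		fds, 1);
 */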
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (use_registered_ring) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return -EINVAL;
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (unlikely(!file))
			return -EBADF;
	} else {
		file = fget(fd);
		if (unlikely(!file))
			return -EBADF;
		ret = -EOPNOTSUPP;
		if (!io_is_uring_fops(file))
			goto out_fput;
	}

	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);
	mutex_unlock(&ctx->uring_lock);
	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
	if (!use_registered_ring)
		fput(file);
	return ret;
}