xref: /linux/io_uring/register.c (revision 5b9b41617bf3e1282cc60f07d3d52e62399aa4ba)
// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

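/*
 * Register an eventfd with the ring so completions can notify userspace.
 * Only one eventfd may be registered per ring; a second registration fails
 * with -EBUSY. With eventfd_async set, notifications are only issued for
 * requests that complete asynchronously rather than inline off the
 * submitting task (the signalling side lives in io_uring.c).
 *
 * Illustrative userspace sketch (not part of this file):
 *
 *	int efd = eventfd(0, EFD_CLOEXEC);
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_EVENTFD,
 *		&efd, 1);
 */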
static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
			       unsigned int eventfd_async)
{
	struct io_ev_fd *ev_fd;
	__s32 __user *fds = arg;
	int fd;

	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					lockdep_is_held(&ctx->uring_lock));
	if (ev_fd)
		return -EBUSY;

	if (copy_from_user(&fd, fds, sizeof(*fds)))
		return -EFAULT;

	ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
	if (!ev_fd)
		return -ENOMEM;

	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
	if (IS_ERR(ev_fd->cq_ev_fd)) {
		int ret = PTR_ERR(ev_fd->cq_ev_fd);
		kfree(ev_fd);
		return ret;
	}

	spin_lock(&ctx->completion_lock);
	ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
	spin_unlock(&ctx->completion_lock);

	ev_fd->eventfd_async = eventfd_async;
	ctx->has_evfd = true;
	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
	atomic_set(&ev_fd->refs, 1);
	atomic_set(&ev_fd->ops, 0);
	return 0;
}

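/*
 * Drop the currently registered eventfd, if any. The io_ev_fd is freed via
 * RCU so that concurrent signalling paths can finish with it; returns -ENXIO
 * when no eventfd is registered.
 */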
int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
	struct io_ev_fd *ev_fd;

	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					lockdep_is_held(&ctx->uring_lock));
	if (ev_fd) {
		ctx->has_evfd = false;
		rcu_assign_pointer(ctx->io_ev_fd, NULL);
		if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops))
			call_rcu(&ev_fd->rcu, io_eventfd_ops);
		return 0;
	}

	return -ENXIO;
}

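/*
 * IORING_REGISTER_PROBE: report which opcodes this kernel supports. Userspace
 * passes a zeroed struct io_uring_probe with room for nr_args ops entries and
 * gets back last_op plus a per-opcode IO_URING_OP_SUPPORTED flag.
 *
 * Illustrative userspace sketch (not part of this file):
 *
 *	struct io_uring_probe *p = calloc(1, sizeof(*p) +
 *				256 * sizeof(struct io_uring_probe_op));
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PROBE, p, 256);
 */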
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	size = struct_size(p, ops, nr_args);
	if (size == SIZE_MAX)
		return -EOVERFLOW;
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;
	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (!io_issue_defs[i].not_supported)
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

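/*
 * Personalities map a registered set of credentials to a small id that SQEs
 * can reference via sqe->personality, letting a ring issue requests with
 * credentials other than those of the submitting task.
 */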
int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}

static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

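/*
 * IORING_REGISTER_RESTRICTIONS: accepts an array of struct io_uring_restriction
 * entries limiting which register opcodes, SQE opcodes and SQE flags the ring
 * may use. Only valid while the ring is still IORING_SETUP_R_DISABLED, and
 * only a single registration is allowed.
 *
 * Illustrative userspace sketch (not part of this file):
 *
 *	struct io_uring_restriction res[] = {
 *		{ .opcode = IORING_RESTRICTION_SQE_OP, .sqe_op = IORING_OP_READV },
 *		{ .opcode = IORING_RESTRICTION_SQE_OP, .sqe_op = IORING_OP_WRITEV },
 *	};
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_RESTRICTIONS,
 *		res, 2);
 */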
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = 0;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].register_op,
				  ctx->restrictions.register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			ret = -EINVAL;
			goto out;
		}
	}

out:
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;

	kfree(res);
	return ret;
}

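/*
 * IORING_REGISTER_ENABLE_RINGS: start a ring that was created with
 * IORING_SETUP_R_DISABLED. Any registered restrictions take effect from this
 * point on, and for IORING_SETUP_SINGLE_ISSUER rings the enabling task becomes
 * the submitter if none was recorded yet.
 */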
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

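/*
 * IORING_REGISTER_IOWQ_AFF: arg points to a CPU bitmask of nr_args bytes that
 * constrains where this ring's io-wq workers (or, with IORING_SETUP_SQPOLL,
 * the io-wq owned by the SQPOLL task) may run. Unregistering passes a NULL
 * mask to reset the affinity.
 */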
static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

	if (in_compat_syscall()) {
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	} else {
		ret = copy_from_user(new_mask, arg, len);
	}

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

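/*
 * IORING_REGISTER_IOWQ_MAX_WORKERS: arg is an array of two __u32 values, the
 * maximum number of bounded and unbounded io-wq workers respectively. A zero
 * entry leaves that limit untouched, and the previous values are copied back
 * to userspace. For SQPOLL rings the limits apply to the SQPOLL task's io-wq;
 * otherwise they are recorded in the ctx and propagated to every task
 * attached to the ring.
 */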
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		struct io_uring_task *tctx = node->task->io_uring;

		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
	}
	return ret;
}

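/*
 * Opcode dispatcher for io_uring_register(2). Runs with ctx->uring_lock held;
 * individual handlers may temporarily drop it (see the iowq affinity and
 * max-workers helpers above). If restrictions are active, opcodes not present
 * in the registered register_op bitmap are rejected with -EACCES.
 */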
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

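/*
 * Syscall entry point. The opcode may carry IORING_REGISTER_USE_REGISTERED_RING,
 * in which case fd is an index into the task's registered ring fds (see
 * IORING_REGISTER_RING_FDS) rather than a normal file descriptor, and no
 * fget()/fput() is needed.
 */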
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (use_registered_ring) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return -EINVAL;
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (unlikely(!file))
			return -EBADF;
	} else {
		file = fget(fd);
		if (unlikely(!file))
			return -EBADF;
		ret = -EOPNOTSUPP;
		if (!io_is_uring_fops(file))
			goto out_fput;
	}

	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);
	mutex_unlock(&ctx->uring_lock);
	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
	if (!use_registered_ring)
		fput(file);
	return ret;
}