// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

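/*
 * IORING_REGISTER_PROBE: report which opcodes this kernel supports. The
 * probe structure passed in from userspace must be zeroed; each op slot up
 * to IORING_OP_LAST is filled in, with IO_URING_OP_SUPPORTED set for the
 * opcodes this kernel implements.
 */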
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	size = struct_size(p, ops, nr_args);
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (io_uring_op_supported(i))
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

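/*
 * Remove the credentials registered under 'id' and drop the reference held
 * on them. Returns -EINVAL if no such personality is registered.
 */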
int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}

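/*
 * IORING_REGISTER_PERSONALITY: store a reference to the current task's
 * credentials in the ctx personality xarray and return the allocated id,
 * which submissions can later refer to via sqe->personality.
 */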
static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

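/*
 * IORING_REGISTER_RESTRICTIONS: limit which register opcodes, SQE opcodes
 * and SQE flags this ring will accept. Only allowed while the ring is still
 * disabled (IORING_SETUP_R_DISABLED), and only one registration is accepted;
 * any invalid entry clears the restrictions again.
 */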
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = 0;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].register_op,
				  ctx->restrictions.register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			ret = -EINVAL;
			goto out;
		}
	}

out:
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;

	kfree(res);
	return ret;
}

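/*
 * IORING_REGISTER_ENABLE_RINGS: enable a ring created with
 * IORING_SETUP_R_DISABLED. This binds the submitter task for single issuer
 * setups, arms any previously registered restrictions, and wakes a waiting
 * SQPOLL thread.
 */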
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if the ring was polled
		 * before submitter_task was set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

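/*
 * Apply 'new_mask' (or reset the affinity if NULL) to the io-wq of the
 * current task, or to the SQPOLL thread's io-wq for IORING_SETUP_SQPOLL
 * rings. The uring_lock is dropped around the SQPOLL path to respect the
 * locking order.
 */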
static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

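/*
 * IORING_REGISTER_IOWQ_AFF: copy a CPU mask from userspace, handling compat
 * bitmap layouts, and apply it to the async io-wq workers.
 */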
static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

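/* IORING_UNREGISTER_IOWQ_AFF: clear any registered io-wq CPU affinity */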
static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

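/*
 * IORING_REGISTER_IOWQ_MAX_WORKERS: cap the number of bounded and unbounded
 * io-wq workers. Zero entries leave the current limit untouched, and the
 * previous values are copied back to userspace. For non-SQPOLL rings the new
 * limits are also propagated to every task registered with this ring.
 */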
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}

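/*
 * IORING_REGISTER_CLOCK: pick the clock source this ring uses when waiting
 * for completions. Only CLOCK_MONOTONIC and CLOCK_BOOTTIME are accepted, and
 * the matching timekeeping offset is stored alongside the clockid.
 */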
static int io_register_clock(struct io_ring_ctx *ctx,
			     struct io_uring_clock_register __user *arg)
{
	struct io_uring_clock_register reg;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;

	switch (reg.clockid) {
	case CLOCK_MONOTONIC:
		ctx->clock_offset = 0;
		break;
	case CLOCK_BOOTTIME:
		ctx->clock_offset = TK_OFFS_BOOT;
		break;
	default:
		return -EINVAL;
	}

	ctx->clockid = reg.clockid;
	return 0;
}

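/*
 * Dispatch a single register opcode. Called with ctx->uring_lock held;
 * individual handlers may temporarily drop and reacquire it. If a submitter
 * task has been bound to the ring, only that task may register, and if
 * restrictions are active the opcode must have been explicitly allowed.
 */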
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

/*
 * Given an 'fd' value, return the ctx associated with it. If 'registered' is
 * true, then the registered index is used. Otherwise, the normal fd table.
 * Caller must call fput() on the returned file, unless it's an ERR_PTR or
 * came from the registered ring fd table, in which case no extra file
 * reference is taken here.
 */
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
	struct file *file;

	if (registered) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return ERR_PTR(-EINVAL);
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
	} else {
		file = fget(fd);
	}

	if (unlikely(!file))
		return ERR_PTR(-EBADF);
	if (io_is_uring_fops(file))
		return file;
	fput(file);
	return ERR_PTR(-EOPNOTSUPP);
}

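/*
 * io_uring_register() syscall entry: resolve the ring file from either a
 * normal or a registered ring fd, then run the requested opcode under
 * uring_lock.
 */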
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	file = io_uring_register_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);
	mutex_unlock(&ctx->uring_lock);
	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
	if (!use_registered_ring)
		fput(file);
	return ret;
}