1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/kernel.h>
3 #include <linux/errno.h>
4 #include <linux/fs.h>
5 #include <linux/file.h>
6 #include <linux/mm.h>
7 #include <linux/slab.h>
8 #include <linux/nospec.h>
9 #include <linux/hugetlb.h>
10 #include <linux/compat.h>
11 #include <linux/io_uring.h>
12
13 #include <uapi/linux/io_uring.h>
14
15 #include "io_uring.h"
16 #include "alloc_cache.h"
17 #include "openclose.h"
18 #include "rsrc.h"
19 #include "memmap.h"
20 #include "register.h"
21
/*
 * Prep-time state for IORING_OP_FILES_UPDATE requests; filled in by
 * io_files_update_prep() and consumed by io_files_update().
 */
struct io_rsrc_update {
	struct file *file;
	/* user pointer to the update payload (array of fds) */
	u64 arg;
	/* number of entries to update */
	u32 nr_args;
	/* first slot to update, or IORING_FILE_INDEX_ALLOC */
	u32 offset;
};
28
29 static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
30 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
31 struct io_mapped_ubuf **pimu,
32 struct page **last_hpage);
33
/* only define max */
#define IORING_MAX_FIXED_FILES (1U << 20)
#define IORING_MAX_REG_BUFFERS (1U << 14)

/*
 * Placeholder installed in sparse/empty buffer slots so table walks don't
 * need NULL checks; never freed and never refcounted.
 */
static const struct io_mapped_ubuf dummy_ubuf = {
	/* set invalid range, so io_import_fixed() fails meeting it */
	.ubuf = -1UL,
	.len = UINT_MAX,
};
43
/*
 * Charge @nr_pages against @user's RLIMIT_MEMLOCK budget.
 *
 * Returns 0 on success, -ENOMEM if the new total would exceed the limit.
 * The update uses a cmpxchg retry loop so concurrent callers cannot
 * jointly overshoot the rlimit.
 */
int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
	return 0;
}
63
io_unaccount_mem(struct io_ring_ctx * ctx,unsigned long nr_pages)64 static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
65 {
66 if (ctx->user)
67 __io_unaccount_mem(ctx->user, nr_pages);
68
69 if (ctx->mm_account)
70 atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
71 }
72
io_account_mem(struct io_ring_ctx * ctx,unsigned long nr_pages)73 static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
74 {
75 int ret;
76
77 if (ctx->user) {
78 ret = __io_account_mem(ctx->user, nr_pages);
79 if (ret)
80 return ret;
81 }
82
83 if (ctx->mm_account)
84 atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
85
86 return 0;
87 }
88
io_buffer_validate(struct iovec * iov)89 static int io_buffer_validate(struct iovec *iov)
90 {
91 unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
92
93 /*
94 * Don't impose further limits on the size and buffer
95 * constraints here, we'll -EINVAL later when IO is
96 * submitted if they are wrong.
97 */
98 if (!iov->iov_base)
99 return iov->iov_len ? -EFAULT : 0;
100 if (!iov->iov_len)
101 return -EFAULT;
102
103 /* arbitrary limit, but we need something */
104 if (iov->iov_len > SZ_1G)
105 return -EFAULT;
106
107 if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
108 return -EOVERFLOW;
109
110 return 0;
111 }
112
/*
 * Drop one reference to the mapped buffer in @slot and clear the slot.
 * On the final reference the pinned pages are released, any accounted
 * memory is uncharged, and the imu itself is freed.  The dummy_ubuf
 * placeholder is recognised and never refcounted or freed.
 */
static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
{
	struct io_mapped_ubuf *imu = *slot;
	unsigned int i;

	*slot = NULL;
	if (imu != &dummy_ubuf) {
		if (!refcount_dec_and_test(&imu->refs))
			return;
		for (i = 0; i < imu->nr_bvecs; i++)
			unpin_user_page(imu->bvec[i].bv_page);
		if (imu->acct_pages)
			io_unaccount_mem(ctx, imu->acct_pages);
		kvfree(imu);
	}
}
129
/*
 * Release the resource carried by a retired rsrc node, first posting the
 * user-supplied removal tag as an auxiliary CQE if one was attached.
 */
static void io_rsrc_put_work(struct io_rsrc_node *node)
{
	struct io_rsrc_put *prsrc = &node->item;

	if (prsrc->tag)
		io_post_aux_cqe(node->ctx, prsrc->tag, 0, 0);

	switch (node->type) {
	case IORING_RSRC_FILE:
		fput(prsrc->file);
		break;
	case IORING_RSRC_BUFFER:
		io_rsrc_buf_put(node->ctx, prsrc);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}
}
149
io_rsrc_node_destroy(struct io_ring_ctx * ctx,struct io_rsrc_node * node)150 void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
151 {
152 if (!io_alloc_cache_put(&ctx->rsrc_node_cache, node))
153 kfree(node);
154 }
155
/*
 * Called when a node's refcount drops to zero.  Walk the pending list
 * from the head and reap every node that has no references left, so
 * resources are always put in the order they were removed; stop at the
 * first node that is still referenced.  Wakes any quiescing waiter once
 * the list is fully drained.
 */
void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
	__must_hold(&node->ctx->uring_lock)
{
	struct io_ring_ctx *ctx = node->ctx;

	while (!list_empty(&ctx->rsrc_ref_list)) {
		node = list_first_entry(&ctx->rsrc_ref_list,
					struct io_rsrc_node, node);
		/* recycle ref nodes in order */
		if (node->refs)
			break;
		list_del(&node->node);

		/* empty nodes are quiesce placeholders, nothing to put */
		if (likely(!node->empty))
			io_rsrc_put_work(node);
		io_rsrc_node_destroy(ctx, node);
	}
	if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce))
		wake_up_all(&ctx->rsrc_quiesce_wq);
}
176
io_rsrc_node_alloc(struct io_ring_ctx * ctx)177 struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
178 {
179 struct io_rsrc_node *ref_node;
180
181 ref_node = io_alloc_cache_get(&ctx->rsrc_node_cache);
182 if (!ref_node) {
183 ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
184 if (!ref_node)
185 return NULL;
186 }
187
188 ref_node->ctx = ctx;
189 ref_node->empty = 0;
190 ref_node->refs = 1;
191 return ref_node;
192 }
193
/*
 * Wait until every pending rsrc node has been reaped, so the table
 * backing @data can safely be torn down or replaced.  May drop and
 * re-acquire ctx->uring_lock while waiting.
 *
 * Returns 0 once the ref list is empty, -ENXIO if another quiesce is
 * already in flight, -ENOMEM on allocation failure, or a negative error
 * from io_run_task_work_sig() if interrupted before draining.
 */
__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
				      struct io_ring_ctx *ctx)
{
	struct io_rsrc_node *backup;
	DEFINE_WAIT(we);
	int ret;

	/* As We may drop ->uring_lock, other task may have started quiesce */
	if (data->quiesce)
		return -ENXIO;

	backup = io_rsrc_node_alloc(ctx);
	if (!backup)
		return -ENOMEM;
	/* retire the current node as an empty placeholder, install backup */
	ctx->rsrc_node->empty = true;
	ctx->rsrc_node->type = -1;
	list_add_tail(&ctx->rsrc_node->node, &ctx->rsrc_ref_list);
	io_put_rsrc_node(ctx, ctx->rsrc_node);
	ctx->rsrc_node = backup;

	if (list_empty(&ctx->rsrc_ref_list))
		return 0;

	/*
	 * For DEFER_TASKRUN rings, make sure task_work queued while we
	 * sleep triggers a wakeup.
	 */
	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
		atomic_set(&ctx->cq_wait_nr, 1);
		smp_mb();
	}

	ctx->rsrc_quiesce++;
	data->quiesce = true;
	do {
		prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE);
		mutex_unlock(&ctx->uring_lock);

		ret = io_run_task_work_sig(ctx);
		if (ret < 0) {
			/* interrupted; still report success if we drained */
			finish_wait(&ctx->rsrc_quiesce_wq, &we);
			mutex_lock(&ctx->uring_lock);
			if (list_empty(&ctx->rsrc_ref_list))
				ret = 0;
			break;
		}

		schedule();
		mutex_lock(&ctx->uring_lock);
		ret = 0;
	} while (!list_empty(&ctx->rsrc_ref_list));

	finish_wait(&ctx->rsrc_quiesce_wq, &we);
	data->quiesce = false;
	ctx->rsrc_quiesce--;

	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
		atomic_set(&ctx->cq_wait_nr, 0);
		smp_mb();
	}
	return ret;
}
252
/*
 * Free a two-level table built by io_alloc_page_table(); @size is the
 * total payload size that was originally requested.
 */
static void io_free_page_table(void **table, size_t size)
{
	unsigned nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);

	while (nr_tables--)
		kfree(table[nr_tables]);
	kfree(table);
}
261
/* Free an io_rsrc_data and its paged per-slot tag table. */
static void io_rsrc_data_free(struct io_rsrc_data *data)
{
	size_t size = data->nr * sizeof(data->tags[0][0]);

	if (data->tags)
		io_free_page_table((void **)data->tags, size);
	kfree(data);
}
270
io_alloc_page_table(size_t size)271 static __cold void **io_alloc_page_table(size_t size)
272 {
273 unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
274 size_t init_size = size;
275 void **table;
276
277 table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
278 if (!table)
279 return NULL;
280
281 for (i = 0; i < nr_tables; i++) {
282 unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
283
284 table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
285 if (!table[i]) {
286 io_free_page_table(table, init_size);
287 return NULL;
288 }
289 size -= this_size;
290 }
291 return table;
292 }
293
/*
 * Allocate an io_rsrc_data tracking @nr resources of @type, copying the
 * optional per-slot user tags in from @utags.  On success *pdata is set
 * and 0 returned; on failure everything is freed and a negative error
 * code returned.
 */
__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, int type,
				     u64 __user *utags,
				     unsigned nr, struct io_rsrc_data **pdata)
{
	struct io_rsrc_data *data;
	int ret = 0;
	unsigned i;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return -ENOMEM;
	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
	if (!data->tags) {
		kfree(data);
		return -ENOMEM;
	}

	data->nr = nr;
	data->ctx = ctx;
	data->rsrc_type = type;
	if (utags) {
		ret = -EFAULT;
		for (i = 0; i < nr; i++) {
			u64 *tag_slot = io_get_tag_slot(data, i);

			if (copy_from_user(tag_slot, &utags[i],
					   sizeof(*tag_slot)))
				goto fail;
		}
	}
	*pdata = data;
	return 0;
fail:
	io_rsrc_data_free(data);
	return ret;
}
330
/*
 * Apply a batch update to the registered file table: for each entry,
 * queue removal of any existing file in the slot, then install the new
 * fd (or leave the slot empty for fd == -1; IORING_REGISTER_FILES_SKIP
 * leaves the slot untouched).
 *
 * Returns the number of entries processed if any were, otherwise the
 * first error encountered.
 */
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	struct io_rsrc_data *data = ctx->file_data;
	struct io_fixed_file *file_slot;
	int fd, i, err = 0;
	unsigned int done;

	if (!ctx->file_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_files)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		/* a tag makes no sense without a file to attach it to */
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
		file_slot = io_fixed_file_slot(&ctx->file_table, i);

		/* queue the old file for deferred fput via the rsrc node */
		if (file_slot->file_ptr) {
			err = io_queue_rsrc_removal(data, i,
						    io_slot_file(file_slot));
			if (err)
				break;
			file_slot->file_ptr = 0;
			io_file_bitmap_clear(&ctx->file_table, i);
		}
		if (fd != -1) {
			struct file *file = fget(fd);

			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			*io_get_tag_slot(data, i) = tag;
			io_fixed_file_set(file_slot, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}
	return done ? done : err;
}
395
/*
 * Apply a batch update to the registered buffer table: register each new
 * iovec, queue removal of the buffer previously occupying the slot, and
 * store the user tag.  A NULL iov_base installs the sparse placeholder.
 *
 * Returns the number of entries processed if any were, otherwise the
 * first error encountered.
 */
static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec fast_iov, *iov;
	struct page *last_hpage = NULL;
	struct iovec __user *uvec;
	u64 user_data = up->data;
	__u32 done;
	int i, err;

	if (!ctx->buf_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_bufs)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_mapped_ubuf *imu;
		u64 tag = 0;

		uvec = u64_to_user_ptr(user_data);
		iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
		if (IS_ERR(iov)) {
			err = PTR_ERR(iov);
			break;
		}
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(iov);
		if (err)
			break;
		/* a tag makes no sense on a sparse (NULL) slot */
		if (!iov->iov_base && tag) {
			err = -EINVAL;
			break;
		}
		err = io_sqe_buffer_register(ctx, iov, &imu, &last_hpage);
		if (err)
			break;

		i = array_index_nospec(up->offset + done, ctx->nr_user_bufs);
		if (ctx->user_bufs[i] != &dummy_ubuf) {
			/* defer release of the old buffer via an rsrc node */
			err = io_queue_rsrc_removal(ctx->buf_data, i,
						    ctx->user_bufs[i]);
			if (unlikely(err)) {
				io_buffer_unmap(ctx, &imu);
				break;
			}
			ctx->user_bufs[i] = (struct io_mapped_ubuf *)&dummy_ubuf;
		}

		ctx->user_bufs[i] = imu;
		*io_get_tag_slot(ctx->buf_data, i) = tag;
		/* advance to the next user iovec; compat layout is smaller */
		if (ctx->compat)
			user_data += sizeof(struct compat_iovec);
		else
			user_data += sizeof(struct iovec);
	}
	return done ? done : err;
}
458
__io_register_rsrc_update(struct io_ring_ctx * ctx,unsigned type,struct io_uring_rsrc_update2 * up,unsigned nr_args)459 static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
460 struct io_uring_rsrc_update2 *up,
461 unsigned nr_args)
462 {
463 __u32 tmp;
464
465 lockdep_assert_held(&ctx->uring_lock);
466
467 if (check_add_overflow(up->offset, nr_args, &tmp))
468 return -EOVERFLOW;
469
470 switch (type) {
471 case IORING_RSRC_FILE:
472 return __io_sqe_files_update(ctx, up, nr_args);
473 case IORING_RSRC_BUFFER:
474 return __io_sqe_buffers_update(ctx, up, nr_args);
475 }
476 return -EINVAL;
477 }
478
/*
 * IORING_REGISTER_FILES_UPDATE: parse the legacy (smaller)
 * io_uring_rsrc_update layout into the v2 struct and apply it to the
 * file table.
 */
int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up = {};

	if (!nr_args)
		return -EINVAL;
	/* only the legacy-sized prefix comes from userspace */
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}
493
/*
 * IORING_REGISTER_FILES_UPDATE2 / IORING_REGISTER_BUFFERS_UPDATE:
 * validate the full io_uring_rsrc_update2 payload and apply it to the
 * table selected by @type.
 */
int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}
507
/*
 * IORING_REGISTER_FILES2 / IORING_REGISTER_BUFFERS2: register a file or
 * buffer table described by io_uring_rsrc_register.  With the SPARSE
 * flag set, @data must be NULL and all slots start empty; a SPARSE
 * request that also supplies data falls through to -EINVAL.
 */
__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}
539
/*
 * Prep handler for IORING_OP_FILES_UPDATE: reject unsupported request
 * flags and SQE fields, then stash offset, count and the user pointer
 * to the fd array in the io_rsrc_update command data.
 */
int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	return 0;
}
556
/*
 * IORING_OP_FILES_UPDATE with offset == IORING_FILE_INDEX_ALLOC:
 * install each fd into a kernel-chosen free slot and write the chosen
 * slot index back to the user's fd array.  If the write-back fails, the
 * just-installed file is closed again so no slot leaks.
 *
 * Returns the number of entries processed if any were, otherwise the
 * first error encountered.
 */
static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_data)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		/* report the allocated slot back; undo install on failure */
		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}
595
/*
 * Issue handler for IORING_OP_FILES_UPDATE.  Either allocates free
 * slots (offset == IORING_FILE_INDEX_ALLOC) or updates the given range
 * of the fixed file table under the ring lock.
 */
int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;
	int ret;

	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
						&up2, up->nr_args);
		io_ring_submit_unlock(ctx, issue_flags);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}
624
/*
 * Defer the release of @rsrc (a file or buffer being removed from slot
 * @idx): the current rsrc node takes ownership of the resource and its
 * tag and is retired onto the pending list, while a fresh node becomes
 * the ring's active one.  The resource is actually put once all requests
 * referencing the old node have completed.
 *
 * Returns 0 on success, -ENOMEM if no replacement node could be
 * allocated (in which case nothing is changed).
 */
int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc)
{
	struct io_ring_ctx *ctx = data->ctx;
	struct io_rsrc_node *node = ctx->rsrc_node;
	u64 *tag_slot = io_get_tag_slot(data, idx);

	ctx->rsrc_node = io_rsrc_node_alloc(ctx);
	if (unlikely(!ctx->rsrc_node)) {
		ctx->rsrc_node = node;
		return -ENOMEM;
	}

	node->item.rsrc = rsrc;
	node->type = data->rsrc_type;
	/* the node consumes the tag; clear the slot for any new resource */
	node->item.tag = *tag_slot;
	*tag_slot = 0;
	list_add_tail(&node->node, &ctx->rsrc_ref_list);
	io_put_rsrc_node(ctx, node);
	return 0;
}
645
/*
 * Tear down the fixed file table: put every registered file and free the
 * table and its rsrc data.  Caller must have quiesced outstanding rsrc
 * references first (see io_sqe_files_unregister()).
 */
void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	int i;

	for (i = 0; i < ctx->nr_user_files; i++) {
		struct file *file = io_file_from_index(&ctx->file_table, i);

		/* skip sparse slots */
		if (!file)
			continue;
		io_file_bitmap_clear(&ctx->file_table, i);
		fput(file);
	}

	io_free_file_tables(&ctx->file_table);
	io_file_table_set_alloc_range(ctx, 0, 0);
	io_rsrc_data_free(ctx->file_data);
	ctx->file_data = NULL;
	ctx->nr_user_files = 0;
}
665
/*
 * IORING_UNREGISTER_FILES: quiesce all outstanding references to the
 * file table, then free it.  Returns -ENXIO if no table is registered.
 */
int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_files;
	int ret;

	if (!ctx->file_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_files = 0;
	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
	ctx->nr_user_files = nr;
	if (!ret)
		__io_sqe_files_unregister(ctx);
	return ret;
}
685
/*
 * IORING_REGISTER_FILES: build the fixed file table from the user's fd
 * array (@arg may be NULL for a fully sparse table) with optional
 * per-slot @tags.  fd == -1 leaves a slot sparse; tags on sparse slots
 * are rejected.  On any failure everything registered so far is undone.
 */
int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_data)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_FILE, tags, nr_args,
				 &ctx->file_data);
	if (ret)
		return ret;

	if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
		io_rsrc_data_free(ctx->file_data);
		ctx->file_data = NULL;
		return -ENOMEM;
	}

	/* nr_user_files tracks progress so the fail path frees only what's set */
	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
		struct io_fixed_file *file_slot;

		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
			ret = -EFAULT;
			goto fail;
		}
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		file_slot = io_fixed_file_slot(&ctx->file_table, i);
		io_fixed_file_set(file_slot, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files);
	return 0;
fail:
	__io_sqe_files_unregister(ctx);
	return ret;
}
752
/* Put the buffer held by a retired rsrc node and clear its pointer. */
static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	io_buffer_unmap(ctx, &prsrc->buf);
	prsrc->buf = NULL;
}
758
/*
 * Tear down the registered buffer table: unmap every buffer and free the
 * slot array and rsrc data.  Caller must have quiesced outstanding rsrc
 * references first (see io_sqe_buffers_unregister()).
 */
void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned int i;

	for (i = 0; i < ctx->nr_user_bufs; i++)
		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
	kfree(ctx->user_bufs);
	io_rsrc_data_free(ctx->buf_data);
	ctx->user_bufs = NULL;
	ctx->buf_data = NULL;
	ctx->nr_user_bufs = 0;
}
771
/*
 * IORING_UNREGISTER_BUFFERS: quiesce all outstanding references to the
 * buffer table, then free it.  Returns -ENXIO if none is registered.
 */
int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_bufs;
	int ret;

	if (!ctx->buf_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_bufs = 0;
	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
	ctx->nr_user_bufs = nr;
	if (!ret)
		__io_sqe_buffers_unregister(ctx);
	return ret;
}
791
792 /*
793 * Not super efficient, but this is just a registration time. And we do cache
794 * the last compound head, so generally we'll only do a full search if we don't
795 * match that one.
796 *
797 * We check if the given compound head page has already been accounted, to
798 * avoid double accounting it. This allows us to account the full size of the
799 * page, not just the constituent pages of a huge page.
800 */
headpage_already_acct(struct io_ring_ctx * ctx,struct page ** pages,int nr_pages,struct page * hpage)801 static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
802 int nr_pages, struct page *hpage)
803 {
804 int i, j;
805
806 /* check current page array */
807 for (i = 0; i < nr_pages; i++) {
808 if (!PageCompound(pages[i]))
809 continue;
810 if (compound_head(pages[i]) == hpage)
811 return true;
812 }
813
814 /* check previously registered pages */
815 for (i = 0; i < ctx->nr_user_bufs; i++) {
816 struct io_mapped_ubuf *imu = ctx->user_bufs[i];
817
818 for (j = 0; j < imu->nr_bvecs; j++) {
819 if (!PageCompound(imu->bvec[j].bv_page))
820 continue;
821 if (compound_head(imu->bvec[j].bv_page) == hpage)
822 return true;
823 }
824 }
825
826 return false;
827 }
828
/*
 * Compute and charge the memory accounting for a buffer's pinned pages.
 * Normal pages count one each; a huge page is charged its full size the
 * first time its head page is seen (anywhere in this or a previously
 * registered buffer) and not again.  On failure nothing stays charged.
 */
static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			/* fast path: same huge page as the previous entry */
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}
860
/*
 * Collapse a page array whose pages span whole folios into an array of
 * one (head) page per folio, dropping the extra pin references on the
 * non-head pages.  Replaces *pages/*nr_pages on success; returns false
 * (leaving the input untouched) if the smaller array can't be allocated.
 */
static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages,
				  struct io_imu_folio_data *data, int nr_folios)
{
	struct page **page_array = *pages, **new_array = NULL;
	int nr_pages_left = *nr_pages, i, j;

	/* Store head pages only*/
	new_array = kvmalloc_array(nr_folios, sizeof(struct page *),
				   GFP_KERNEL);
	if (!new_array)
		return false;

	new_array[0] = compound_head(page_array[0]);
	/*
	 * The pages are bound to the folio, it doesn't
	 * actually unpin them but drops all but one reference,
	 * which is usually put down by io_buffer_unmap().
	 * Note, needs a better helper.
	 */
	if (data->nr_pages_head > 1)
		unpin_user_pages(&page_array[1], data->nr_pages_head - 1);

	/* j walks the old array folio by folio; i indexes the new one */
	j = data->nr_pages_head;
	nr_pages_left -= data->nr_pages_head;
	for (i = 1; i < nr_folios; i++) {
		unsigned int nr_unpin;

		new_array[i] = page_array[j];
		/* the tail folio may be partial, don't unpin past the end */
		nr_unpin = min_t(unsigned int, nr_pages_left - 1,
				 data->nr_pages_mid - 1);
		if (nr_unpin)
			unpin_user_pages(&page_array[j+1], nr_unpin);
		j += data->nr_pages_mid;
		nr_pages_left -= data->nr_pages_mid;
	}
	kvfree(page_array);
	*pages = new_array;
	*nr_pages = nr_folios;
	return true;
}
901
/*
 * Check whether the pinned pages form contiguous runs inside uniform
 * folios (huge pages) and, if so, coalesce them into one array entry per
 * folio via io_do_coalesce_buffer().  Fills in @data with the folio
 * geometry used later for bvec setup.  Returns false if the layout
 * doesn't qualify; *pages is then left unchanged.
 */
static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
				   struct io_imu_folio_data *data)
{
	struct page **page_array = *pages;
	struct folio *folio = page_folio(page_array[0]);
	unsigned int count = 1, nr_folios = 1;
	int i;

	if (*nr_pages <= 1)
		return false;

	data->nr_pages_mid = folio_nr_pages(folio);
	/* order-0 folios have nothing to coalesce */
	if (data->nr_pages_mid == 1)
		return false;

	data->folio_shift = folio_shift(folio);
	/*
	 * Check if pages are contiguous inside a folio, and all folios have
	 * the same page count except for the head and tail.
	 */
	for (i = 1; i < *nr_pages; i++) {
		if (page_folio(page_array[i]) == folio &&
		    page_array[i] == page_array[i-1] + 1) {
			count++;
			continue;
		}

		/* folio boundary: the previous run must have reached its end */
		if (nr_folios == 1) {
			if (folio_page_idx(folio, page_array[i-1]) !=
			    data->nr_pages_mid - 1)
				return false;

			data->nr_pages_head = count;
		} else if (count != data->nr_pages_mid) {
			return false;
		}

		/* a new folio must match size and start at its first page */
		folio = page_folio(page_array[i]);
		if (folio_size(folio) != (1UL << data->folio_shift) ||
		    folio_page_idx(folio, page_array[i]) != 0)
			return false;

		count = 1;
		nr_folios++;
	}
	if (nr_folios == 1)
		data->nr_pages_head = count;

	return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios);
}
952
/*
 * Pin and map a single user buffer described by @iov, producing an
 * io_mapped_ubuf in *pimu.  A NULL iov_base installs the sparse
 * placeholder and succeeds.  Huge-page-backed ranges are coalesced into
 * fewer bvec entries when possible.  @last_hpage caches the most recent
 * compound head across calls for cheaper accounting.
 *
 * Returns 0 on success; on error all pins/allocations are released and
 * *pimu is left pointing at the placeholder.
 */
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;
	struct io_imu_folio_data data;
	bool coalesced;

	*pimu = (struct io_mapped_ubuf *)&dummy_ubuf;
	if (!iov->iov_base)
		return 0;

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
			     &nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	/* If it's huge page(s), try to coalesce them into fewer bvec entries */
	coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data);

	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
	if (!imu)
		goto done;

	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret) {
		unpin_user_pages(pages, nr_pages);
		goto done;
	}

	size = iov->iov_len;
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->len = iov->iov_len;
	imu->nr_bvecs = nr_pages;
	imu->folio_shift = PAGE_SHIFT;
	if (coalesced)
		imu->folio_shift = data.folio_shift;
	refcount_set(&imu->refs, 1);
	/* offset of the buffer start within the first (possibly huge) page */
	off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1);
	*pimu = imu;
	ret = 0;

	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
		off = 0;
		size -= vec_len;
	}
done:
	if (ret)
		kvfree(imu);
	kvfree(pages);
	return ret;
}
1018
io_buffers_map_alloc(struct io_ring_ctx * ctx,unsigned int nr_args)1019 static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
1020 {
1021 ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
1022 return ctx->user_bufs ? 0 : -ENOMEM;
1023 }
1024
/*
 * IORING_REGISTER_BUFFERS: register up to IORING_MAX_REG_BUFFERS user
 * buffers from the iovec array at @arg (NULL for a fully sparse table),
 * with optional per-slot @tags.  On any failure all buffers registered
 * so far are torn down again.
 */
int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data *data;
	struct iovec fast_iov, *iov = &fast_iov;
	const struct iovec __user *uvec;
	int i, ret;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->user_bufs)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, tags, nr_args, &data);
	if (ret)
		return ret;
	ret = io_buffers_map_alloc(ctx, nr_args);
	if (ret) {
		io_rsrc_data_free(data);
		return ret;
	}

	/* sparse registration: every slot uses the zeroed iovec */
	if (!arg)
		memset(iov, 0, sizeof(*iov));

	/* nr_user_bufs tracks progress so the fail path frees only what's set */
	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
		if (arg) {
			uvec = (struct iovec __user *) arg;
			iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
			if (IS_ERR(iov)) {
				ret = PTR_ERR(iov);
				break;
			}
			ret = io_buffer_validate(iov);
			if (ret)
				break;
			/* compat userspace has a smaller iovec layout */
			if (ctx->compat)
				arg += sizeof(struct compat_iovec);
			else
				arg += sizeof(struct iovec);
		}

		/* a tag makes no sense on a sparse (NULL) slot */
		if (!iov->iov_base && *io_get_tag_slot(data, i)) {
			ret = -EINVAL;
			break;
		}

		ret = io_sqe_buffer_register(ctx, iov, &ctx->user_bufs[i],
					     &last_hpage);
		if (ret)
			break;
	}

	WARN_ON_ONCE(ctx->buf_data);

	ctx->buf_data = data;
	if (ret)
		__io_sqe_buffers_unregister(ctx);
	return ret;
}
1087
/*
 * Set up @iter over the registered buffer @imu for an I/O of @len bytes
 * at user address @buf_addr.  Verifies the requested range lies fully
 * inside the registered region, then positions a bvec iterator at the
 * right segment/offset without the cost of iov_iter_advance().
 *
 * Returns 0 on success, -EFAULT if the range is invalid.
 */
int io_import_fixed(int ddir, struct iov_iter *iter,
		    struct io_mapped_ubuf *imu,
		    u64 buf_addr, size_t len)
{
	u64 buf_end;
	size_t offset;

	if (WARN_ON_ONCE(!imu))
		return -EFAULT;
	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
		return -EFAULT;

	/*
	 * Might not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);

	if (offset) {
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here, because
		 * we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are the same in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or the whole first
		 * bvec, just use iov_iter_advance(). This makes it easier
		 * since we can just skip the first segment, which may not
		 * be folio_size aligned.
		 */
		const struct bio_vec *bvec = imu->bvec;

		if (offset < bvec->bv_len) {
			iter->bvec = bvec;
			iter->count -= offset;
			iter->iov_offset = offset;
		} else {
			unsigned long seg_skip;

			/* skip first vec */
			offset -= bvec->bv_len;
			seg_skip = 1 + (offset >> imu->folio_shift);

			iter->bvec = bvec + seg_skip;
			iter->nr_segs -= seg_skip;
			iter->count -= bvec->bv_len + offset;
			iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1);
		}
	}

	return 0;
}
1149
/*
 * Clone all registered buffers from @src_ctx into @ctx by taking an
 * extra reference on each mapping, so no pages are re-pinned or
 * re-accounted.  Called with ctx->uring_lock held; temporarily drops it
 * to take the source ring's lock, then re-takes it to install the table.
 *
 * Returns 0 on success, -ENXIO if the source has no buffers, -ENOMEM on
 * allocation failure, or -EBUSY if someone registered buffers on @ctx
 * while the lock was dropped.
 */
static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx)
{
	struct io_mapped_ubuf **user_bufs;
	struct io_rsrc_data *data;
	int i, ret, nbufs;

	/*
	 * Drop our own lock here. We'll setup the data we need and reference
	 * the source buffers, then re-grab, check, and assign at the end.
	 */
	mutex_unlock(&ctx->uring_lock);

	mutex_lock(&src_ctx->uring_lock);
	ret = -ENXIO;
	nbufs = src_ctx->nr_user_bufs;
	if (!nbufs)
		goto out_unlock;
	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, NULL, nbufs, &data);
	if (ret)
		goto out_unlock;

	ret = -ENOMEM;
	user_bufs = kcalloc(nbufs, sizeof(*ctx->user_bufs), GFP_KERNEL);
	if (!user_bufs)
		goto out_free_data;

	for (i = 0; i < nbufs; i++) {
		struct io_mapped_ubuf *src = src_ctx->user_bufs[i];

		/* the sparse placeholder is shared, never refcounted */
		if (src != &dummy_ubuf)
			refcount_inc(&src->refs);
		user_bufs[i] = src;
	}

	/* Have a ref on the bufs now, drop src lock and re-grab our own lock */
	mutex_unlock(&src_ctx->uring_lock);
	mutex_lock(&ctx->uring_lock);
	if (!ctx->user_bufs) {
		ctx->user_bufs = user_bufs;
		ctx->buf_data = data;
		ctx->nr_user_bufs = nbufs;
		return 0;
	}

	/* someone raced setting up buffers, dump ours */
	for (i = 0; i < nbufs; i++)
		io_buffer_unmap(ctx, &user_bufs[i]);
	io_rsrc_data_free(data);
	kfree(user_bufs);
	return -EBUSY;
out_free_data:
	io_rsrc_data_free(data);
out_unlock:
	mutex_unlock(&src_ctx->uring_lock);
	mutex_lock(&ctx->uring_lock);
	return ret;
}
1207
1208 /*
1209 * Copy the registered buffers from the source ring whose file descriptor
1210 * is given in the src_fd to the current ring. This is identical to registering
1211 * the buffers with ctx, except faster as mappings already exist.
1212 *
1213 * Since the memory is already accounted once, don't account it again.
1214 */
/*
 * IORING_REGISTER_CLONE_BUFFERS: validate the request, resolve the
 * source ring from src_fd (optionally a registered ring fd), and clone
 * its buffer registrations into this ring.
 */
int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_clone_buffers buf;
	bool registered_src;
	struct file *file;
	int ret;

	if (ctx->user_bufs || ctx->nr_user_bufs)
		return -EBUSY;
	if (copy_from_user(&buf, arg, sizeof(buf)))
		return -EFAULT;
	if (buf.flags & ~IORING_REGISTER_SRC_REGISTERED)
		return -EINVAL;
	if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
		return -EINVAL;

	registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
	file = io_uring_register_get_file(buf.src_fd, registered_src);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ret = io_clone_buffers(ctx, file->private_data);
	/* registered ring files aren't individually referenced, don't put */
	if (!registered_src)
		fput(file);
	return ret;
}
1240