xref: /linux/io_uring/rsrc.c (revision 876f5ebd58a9ac42f48a7ead3d5b274a314e0ace)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/kernel.h>
3 #include <linux/errno.h>
4 #include <linux/fs.h>
5 #include <linux/file.h>
6 #include <linux/mm.h>
7 #include <linux/slab.h>
8 #include <linux/nospec.h>
9 #include <linux/hugetlb.h>
10 #include <linux/compat.h>
11 #include <linux/io_uring.h>
12 #include <linux/io_uring/cmd.h>
13 
14 #include <uapi/linux/io_uring.h>
15 
16 #include "io_uring.h"
17 #include "openclose.h"
18 #include "rsrc.h"
19 #include "memmap.h"
20 #include "register.h"
21 
22 struct io_rsrc_update {
23 	struct file			*file;
24 	u64				arg;
25 	u32				nr_args;
26 	u32				offset;
27 };
28 
29 static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
30 			struct iovec *iov, struct page **last_hpage);
31 
32 /* only define max */
33 #define IORING_MAX_FIXED_FILES	(1U << 20)
34 #define IORING_MAX_REG_BUFFERS	(1U << 14)
35 
36 #define IO_CACHED_BVECS_SEGS	32
37 
38 int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
39 {
40 	unsigned long page_limit, cur_pages, new_pages;
41 
42 	if (!nr_pages)
43 		return 0;
44 
45 	/* Don't allow more pages than we can safely lock */
46 	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
47 
48 	cur_pages = atomic_long_read(&user->locked_vm);
49 	do {
50 		new_pages = cur_pages + nr_pages;
51 		if (new_pages > page_limit)
52 			return -ENOMEM;
53 	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
54 					  &cur_pages, new_pages));
55 	return 0;
56 }
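/*
 * Worked example (illustrative, assuming 4 KiB pages): with
 * RLIMIT_MEMLOCK = 64 MiB, page_limit above is 16384 pages. A request to
 * account 4096 more pages succeeds only while locked_vm + 4096 <= 16384;
 * if another task races and bumps locked_vm in between, the try_cmpxchg
 * fails, cur_pages is refreshed, and the limit check is redone before
 * retrying.
 */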
57 
58 static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
59 {
60 	if (ctx->user)
61 		__io_unaccount_mem(ctx->user, nr_pages);
62 
63 	if (ctx->mm_account)
64 		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
65 }
66 
67 static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
68 {
69 	int ret;
70 
71 	if (ctx->user) {
72 		ret = __io_account_mem(ctx->user, nr_pages);
73 		if (ret)
74 			return ret;
75 	}
76 
77 	if (ctx->mm_account)
78 		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
79 
80 	return 0;
81 }
82 
83 int io_validate_user_buf_range(u64 uaddr, u64 ulen)
84 {
85 	unsigned long tmp, base = (unsigned long)uaddr;
86 	unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen);
87 
88 	/* arbitrary limit, but we need something */
89 	if (ulen > SZ_1G || !ulen)
90 		return -EFAULT;
91 	if (check_add_overflow(base, acct_len, &tmp))
92 		return -EOVERFLOW;
93 	return 0;
94 }
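/*
 * Illustrative calls (sketch, assuming 4 KiB pages on a 64-bit kernel):
 *
 *	io_validate_user_buf_range(0x10000, 0);		// -EFAULT, zero length
 *	io_validate_user_buf_range(0x10000, SZ_1G + 1);	// -EFAULT, over the 1G cap
 *	io_validate_user_buf_range(-4096ULL, 8192);	// -EOVERFLOW, base + len wraps
 *	io_validate_user_buf_range(0x10000, 100);	// 0, len rounded up to one page
 */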
95 
96 static int io_buffer_validate(struct iovec *iov)
97 {
98 	/*
99 	 * Don't impose further limits on the size and buffer
100 	 * constraints here; we'll return -EINVAL later when the IO
101 	 * is submitted if they are wrong.
102 	 */
103 	if (!iov->iov_base)
104 		return iov->iov_len ? -EFAULT : 0;
105 
106 	return io_validate_user_buf_range((unsigned long)iov->iov_base,
107 					  iov->iov_len);
108 }
109 
110 static void io_release_ubuf(void *priv)
111 {
112 	struct io_mapped_ubuf *imu = priv;
113 	unsigned int i;
114 
115 	for (i = 0; i < imu->nr_bvecs; i++)
116 		unpin_user_page(imu->bvec[i].bv_page);
117 }
118 
119 static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
120 					   int nr_bvecs)
121 {
122 	if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
123 		return io_cache_alloc(&ctx->imu_cache, GFP_KERNEL);
124 	return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs),
125 			GFP_KERNEL);
126 }
127 
128 static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
129 {
130 	if (imu->nr_bvecs <= IO_CACHED_BVECS_SEGS)
131 		io_cache_free(&ctx->imu_cache, imu);
132 	else
133 		kvfree(imu);
134 }
135 
136 static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
137 {
138 	if (!refcount_dec_and_test(&imu->refs))
139 		return;
140 
141 	if (imu->acct_pages)
142 		io_unaccount_mem(ctx, imu->acct_pages);
143 	imu->release(imu->priv);
144 	io_free_imu(ctx, imu);
145 }
146 
147 struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
148 {
149 	struct io_rsrc_node *node;
150 
151 	node = io_cache_alloc(&ctx->node_cache, GFP_KERNEL);
152 	if (node) {
153 		node->type = type;
154 		node->refs = 1;
155 		node->tag = 0;
156 		node->file_ptr = 0;
157 	}
158 	return node;
159 }
160 
161 bool io_rsrc_cache_init(struct io_ring_ctx *ctx)
162 {
163 	const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec,
164 						 IO_CACHED_BVECS_SEGS);
165 	const int node_size = sizeof(struct io_rsrc_node);
166 	bool ret;
167 
168 	ret = io_alloc_cache_init(&ctx->node_cache, IO_ALLOC_CACHE_MAX,
169 				  node_size, 0);
170 	ret |= io_alloc_cache_init(&ctx->imu_cache, IO_ALLOC_CACHE_MAX,
171 				   imu_cache_size, 0);
172 	return ret;
173 }
174 
175 void io_rsrc_cache_free(struct io_ring_ctx *ctx)
176 {
177 	io_alloc_cache_free(&ctx->node_cache, kfree);
178 	io_alloc_cache_free(&ctx->imu_cache, kfree);
179 }
180 
181 static void io_clear_table_tags(struct io_rsrc_data *data)
182 {
183 	int i;
184 
185 	for (i = 0; i < data->nr; i++) {
186 		struct io_rsrc_node *node = data->nodes[i];
187 
188 		if (node)
189 			node->tag = 0;
190 	}
191 }
192 
193 __cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
194 			      struct io_rsrc_data *data)
195 {
196 	if (!data->nr)
197 		return;
198 	while (data->nr--) {
199 		if (data->nodes[data->nr])
200 			io_put_rsrc_node(ctx, data->nodes[data->nr]);
201 	}
202 	kvfree(data->nodes);
203 	data->nodes = NULL;
204 	data->nr = 0;
205 }
206 
207 __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
208 {
209 	data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *),
210 					GFP_KERNEL_ACCOUNT | __GFP_ZERO);
211 	if (data->nodes) {
212 		data->nr = nr;
213 		return 0;
214 	}
215 	return -ENOMEM;
216 }
217 
218 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
219 				 struct io_uring_rsrc_update2 *up,
220 				 unsigned nr_args)
221 {
222 	u64 __user *tags = u64_to_user_ptr(up->tags);
223 	__s32 __user *fds = u64_to_user_ptr(up->data);
224 	int fd, i, err = 0;
225 	unsigned int done;
226 
227 	if (!ctx->file_table.data.nr)
228 		return -ENXIO;
229 	if (up->offset + nr_args > ctx->file_table.data.nr)
230 		return -EINVAL;
231 
232 	for (done = 0; done < nr_args; done++) {
233 		u64 tag = 0;
234 
235 		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
236 		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
237 			err = -EFAULT;
238 			break;
239 		}
240 		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
241 			err = -EINVAL;
242 			break;
243 		}
244 		if (fd == IORING_REGISTER_FILES_SKIP)
245 			continue;
246 
247 		i = up->offset + done;
248 		if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i))
249 			io_file_bitmap_clear(&ctx->file_table, i);
250 
251 		if (fd != -1) {
252 			struct file *file = fget(fd);
253 			struct io_rsrc_node *node;
254 
255 			if (!file) {
256 				err = -EBADF;
257 				break;
258 			}
259 			/*
260 			 * Don't allow io_uring instances to be registered.
261 			 */
262 			if (io_is_uring_fops(file)) {
263 				fput(file);
264 				err = -EBADF;
265 				break;
266 			}
267 			node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
268 			if (!node) {
269 				err = -ENOMEM;
270 				fput(file);
271 				break;
272 			}
273 			ctx->file_table.data.nodes[i] = node;
274 			if (tag)
275 				node->tag = tag;
276 			io_fixed_file_set(node, file);
277 			io_file_bitmap_set(&ctx->file_table, i);
278 		}
279 	}
280 	return done ? done : err;
281 }
282 
283 static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
284 				   struct io_uring_rsrc_update2 *up,
285 				   unsigned int nr_args)
286 {
287 	u64 __user *tags = u64_to_user_ptr(up->tags);
288 	struct iovec fast_iov, *iov;
289 	struct page *last_hpage = NULL;
290 	struct iovec __user *uvec;
291 	u64 user_data = up->data;
292 	__u32 done;
293 	int i, err;
294 
295 	if (!ctx->buf_table.nr)
296 		return -ENXIO;
297 	if (up->offset + nr_args > ctx->buf_table.nr)
298 		return -EINVAL;
299 
300 	for (done = 0; done < nr_args; done++) {
301 		struct io_rsrc_node *node;
302 		u64 tag = 0;
303 
304 		uvec = u64_to_user_ptr(user_data);
305 		iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
306 		if (IS_ERR(iov)) {
307 			err = PTR_ERR(iov);
308 			break;
309 		}
310 		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
311 			err = -EFAULT;
312 			break;
313 		}
314 		err = io_buffer_validate(iov);
315 		if (err)
316 			break;
317 		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
318 		if (IS_ERR(node)) {
319 			err = PTR_ERR(node);
320 			break;
321 		}
322 		if (tag) {
323 			if (!node) {
324 				err = -EINVAL;
325 				break;
326 			}
327 			node->tag = tag;
328 		}
329 		i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
330 		io_reset_rsrc_node(ctx, &ctx->buf_table, i);
331 		ctx->buf_table.nodes[i] = node;
332 		if (ctx->compat)
333 			user_data += sizeof(struct compat_iovec);
334 		else
335 			user_data += sizeof(struct iovec);
336 	}
337 	return done ? done : err;
338 }
339 
340 static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
341 				     struct io_uring_rsrc_update2 *up,
342 				     unsigned nr_args)
343 {
344 	__u32 tmp;
345 
346 	lockdep_assert_held(&ctx->uring_lock);
347 
348 	if (check_add_overflow(up->offset, nr_args, &tmp))
349 		return -EOVERFLOW;
350 
351 	switch (type) {
352 	case IORING_RSRC_FILE:
353 		return __io_sqe_files_update(ctx, up, nr_args);
354 	case IORING_RSRC_BUFFER:
355 		return __io_sqe_buffers_update(ctx, up, nr_args);
356 	}
357 	return -EINVAL;
358 }
359 
360 int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
361 			     unsigned nr_args)
362 {
363 	struct io_uring_rsrc_update2 up;
364 
365 	if (!nr_args)
366 		return -EINVAL;
367 	memset(&up, 0, sizeof(up));
368 	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
369 		return -EFAULT;
370 	if (up.resv || up.resv2)
371 		return -EINVAL;
372 	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
373 }
374 
375 int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
376 			    unsigned size, unsigned type)
377 {
378 	struct io_uring_rsrc_update2 up;
379 
380 	if (size != sizeof(up))
381 		return -EINVAL;
382 	if (copy_from_user(&up, arg, sizeof(up)))
383 		return -EFAULT;
384 	if (!up.nr || up.resv || up.resv2)
385 		return -EINVAL;
386 	return __io_register_rsrc_update(ctx, type, &up, up.nr);
387 }
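/*
 * Userspace sketch (assumed flow; slot numbers and fds are only an example):
 * update two registered-file slots starting at offset 3, clearing the first
 * and replacing the second, via the uapi struct io_uring_rsrc_update2. Note
 * that the last syscall argument for the *_UPDATE2 opcodes is the struct
 * size, as checked above.
 *
 *	__s32 fds[2] = { -1, new_fd };
 *	struct io_uring_rsrc_update2 up = {
 *		.offset	= 3,
 *		.data	= (__u64)(uintptr_t)fds,
 *		.nr	= 2,
 *	};
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_FILES_UPDATE2,
 *		&up, sizeof(up));
 */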
388 
389 __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
390 			    unsigned int size, unsigned int type)
391 {
392 	struct io_uring_rsrc_register rr;
393 
394 	/* keep it extendible */
395 	if (size != sizeof(rr))
396 		return -EINVAL;
397 
398 	memset(&rr, 0, sizeof(rr));
399 	if (copy_from_user(&rr, arg, size))
400 		return -EFAULT;
401 	if (!rr.nr || rr.resv2)
402 		return -EINVAL;
403 	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
404 		return -EINVAL;
405 
406 	switch (type) {
407 	case IORING_RSRC_FILE:
408 		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
409 			break;
410 		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
411 					     rr.nr, u64_to_user_ptr(rr.tags));
412 	case IORING_RSRC_BUFFER:
413 		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
414 			break;
415 		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
416 					       rr.nr, u64_to_user_ptr(rr.tags));
417 	}
418 	return -EINVAL;
419 }
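/*
 * Userspace sketch (assumed flow; the slot count is only an example): create
 * a sparse table of 128 empty buffer slots to be filled later, e.g. via
 * updates or io_buffer_register_bvec(). With IORING_RSRC_REGISTER_SPARSE set,
 * data and tags must be zero, as checked above.
 *
 *	struct io_uring_rsrc_register rr = {
 *		.nr	= 128,
 *		.flags	= IORING_RSRC_REGISTER_SPARSE,
 *	};
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_BUFFERS2,
 *		&rr, sizeof(rr));
 */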
420 
421 int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
422 {
423 	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
424 
425 	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
426 		return -EINVAL;
427 	if (sqe->rw_flags || sqe->splice_fd_in)
428 		return -EINVAL;
429 
430 	up->offset = READ_ONCE(sqe->off);
431 	up->nr_args = READ_ONCE(sqe->len);
432 	if (!up->nr_args)
433 		return -EINVAL;
434 	up->arg = READ_ONCE(sqe->addr);
435 	return 0;
436 }
437 
438 static int io_files_update_with_index_alloc(struct io_kiocb *req,
439 					    unsigned int issue_flags)
440 {
441 	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
442 	__s32 __user *fds = u64_to_user_ptr(up->arg);
443 	unsigned int done;
444 	struct file *file;
445 	int ret, fd;
446 
447 	if (!req->ctx->file_table.data.nr)
448 		return -ENXIO;
449 
450 	for (done = 0; done < up->nr_args; done++) {
451 		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
452 			ret = -EFAULT;
453 			break;
454 		}
455 
456 		file = fget(fd);
457 		if (!file) {
458 			ret = -EBADF;
459 			break;
460 		}
461 		ret = io_fixed_fd_install(req, issue_flags, file,
462 					  IORING_FILE_INDEX_ALLOC);
463 		if (ret < 0)
464 			break;
465 		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
466 			__io_close_fixed(req->ctx, issue_flags, ret);
467 			ret = -EFAULT;
468 			break;
469 		}
470 	}
471 
472 	if (done)
473 		return done;
474 	return ret;
475 }
476 
477 int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
478 {
479 	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
480 	struct io_ring_ctx *ctx = req->ctx;
481 	struct io_uring_rsrc_update2 up2;
482 	int ret;
483 
484 	up2.offset = up->offset;
485 	up2.data = up->arg;
486 	up2.nr = 0;
487 	up2.tags = 0;
488 	up2.resv = 0;
489 	up2.resv2 = 0;
490 
491 	if (up->offset == IORING_FILE_INDEX_ALLOC) {
492 		ret = io_files_update_with_index_alloc(req, issue_flags);
493 	} else {
494 		io_ring_submit_lock(ctx, issue_flags);
495 		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
496 						&up2, up->nr_args);
497 		io_ring_submit_unlock(ctx, issue_flags);
498 	}
499 
500 	if (ret < 0)
501 		req_set_fail(req);
502 	io_req_set_res(req, ret, 0);
503 	return IOU_COMPLETE;
504 }
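/*
 * SQE usage sketch (field values are only an example): the prep handler
 * above takes the starting slot from sqe->off (or IORING_FILE_INDEX_ALLOC
 * to let the kernel pick free slots), the fd array from sqe->addr, and the
 * count from sqe->len.
 *
 *	sqe->opcode	= IORING_OP_FILES_UPDATE;
 *	sqe->off	= 2;				// first fixed-file slot to update
 *	sqe->addr	= (__u64)(uintptr_t)fds;	// __s32 array, -1 clears a slot
 *	sqe->len	= 3;				// number of entries in fds
 */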
505 
506 void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
507 {
508 	if (node->tag)
509 		io_post_aux_cqe(ctx, node->tag, 0, 0);
510 
511 	switch (node->type) {
512 	case IORING_RSRC_FILE:
513 		fput(io_slot_file(node));
514 		break;
515 	case IORING_RSRC_BUFFER:
516 		io_buffer_unmap(ctx, node->buf);
517 		break;
518 	default:
519 		WARN_ON_ONCE(1);
520 		break;
521 	}
522 
523 	io_cache_free(&ctx->node_cache, node);
524 }
525 
526 int io_sqe_files_unregister(struct io_ring_ctx *ctx)
527 {
528 	if (!ctx->file_table.data.nr)
529 		return -ENXIO;
530 
531 	io_free_file_tables(ctx, &ctx->file_table);
532 	io_file_table_set_alloc_range(ctx, 0, 0);
533 	return 0;
534 }
535 
536 int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
537 			  unsigned nr_args, u64 __user *tags)
538 {
539 	__s32 __user *fds = (__s32 __user *) arg;
540 	struct file *file;
541 	int fd, ret;
542 	unsigned i;
543 
544 	if (ctx->file_table.data.nr)
545 		return -EBUSY;
546 	if (!nr_args)
547 		return -EINVAL;
548 	if (nr_args > IORING_MAX_FIXED_FILES)
549 		return -EMFILE;
550 	if (nr_args > rlimit(RLIMIT_NOFILE))
551 		return -EMFILE;
552 	if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args))
553 		return -ENOMEM;
554 
555 	for (i = 0; i < nr_args; i++) {
556 		struct io_rsrc_node *node;
557 		u64 tag = 0;
558 
559 		ret = -EFAULT;
560 		if (tags && copy_from_user(&tag, &tags[i], sizeof(tag)))
561 			goto fail;
562 		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd)))
563 			goto fail;
564 		/* allow sparse sets */
565 		if (!fds || fd == -1) {
566 			ret = -EINVAL;
567 			if (tag)
568 				goto fail;
569 			continue;
570 		}
571 
572 		file = fget(fd);
573 		ret = -EBADF;
574 		if (unlikely(!file))
575 			goto fail;
576 
577 		/*
578 		 * Don't allow io_uring instances to be registered.
579 		 */
580 		if (io_is_uring_fops(file)) {
581 			fput(file);
582 			goto fail;
583 		}
584 		ret = -ENOMEM;
585 		node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
586 		if (!node) {
587 			fput(file);
588 			goto fail;
589 		}
590 		if (tag)
591 			node->tag = tag;
592 		ctx->file_table.data.nodes[i] = node;
593 		io_fixed_file_set(node, file);
594 		io_file_bitmap_set(&ctx->file_table, i);
595 	}
596 
597 	/* default it to the whole table */
598 	io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
599 	return 0;
600 fail:
601 	io_clear_table_tags(&ctx->file_table.data);
602 	io_sqe_files_unregister(ctx);
603 	return ret;
604 }
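/*
 * Userspace sketch (assumed flow; fds are only an example): the legacy
 * IORING_REGISTER_FILES opcode reaches this function with tags == NULL.
 * Slots holding -1 stay sparse and can be filled later.
 *
 *	__s32 fds[4] = { sock_fd, -1, file_fd, -1 };
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_FILES, fds, 4);
 */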
605 
606 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
607 {
608 	if (!ctx->buf_table.nr)
609 		return -ENXIO;
610 	io_rsrc_data_free(ctx, &ctx->buf_table);
611 	return 0;
612 }
613 
614 /*
615  * Not super efficient, but this only runs at registration time. And we do cache
616  * the last compound head, so generally we'll only do a full search if we don't
617  * match that one.
618  *
619  * We check if the given compound head page has already been accounted, to
620  * avoid double accounting it. This allows us to account the full size of the
621  * page, not just the constituent pages of a huge page.
622  */
623 static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
624 				  int nr_pages, struct page *hpage)
625 {
626 	int i, j;
627 
628 	/* check current page array */
629 	for (i = 0; i < nr_pages; i++) {
630 		if (!PageCompound(pages[i]))
631 			continue;
632 		if (compound_head(pages[i]) == hpage)
633 			return true;
634 	}
635 
636 	/* check previously registered pages */
637 	for (i = 0; i < ctx->buf_table.nr; i++) {
638 		struct io_rsrc_node *node = ctx->buf_table.nodes[i];
639 		struct io_mapped_ubuf *imu;
640 
641 		if (!node)
642 			continue;
643 		imu = node->buf;
644 		for (j = 0; j < imu->nr_bvecs; j++) {
645 			if (!PageCompound(imu->bvec[j].bv_page))
646 				continue;
647 			if (compound_head(imu->bvec[j].bv_page) == hpage)
648 				return true;
649 		}
650 	}
651 
652 	return false;
653 }
654 
655 static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
656 				 int nr_pages, struct io_mapped_ubuf *imu,
657 				 struct page **last_hpage)
658 {
659 	int i, ret;
660 
661 	imu->acct_pages = 0;
662 	for (i = 0; i < nr_pages; i++) {
663 		if (!PageCompound(pages[i])) {
664 			imu->acct_pages++;
665 		} else {
666 			struct page *hpage;
667 
668 			hpage = compound_head(pages[i]);
669 			if (hpage == *last_hpage)
670 				continue;
671 			*last_hpage = hpage;
672 			if (headpage_already_acct(ctx, pages, i, hpage))
673 				continue;
674 			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
675 		}
676 	}
677 
678 	if (!imu->acct_pages)
679 		return 0;
680 
681 	ret = io_account_mem(ctx, imu->acct_pages);
682 	if (ret)
683 		imu->acct_pages = 0;
684 	return ret;
685 }
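/*
 * Worked example (illustrative, assuming 2 MiB huge pages and 4 KiB base
 * pages): registering a 128 KiB buffer that lives inside one huge page
 * accounts the whole compound page, i.e. acct_pages = 512 rather than 32.
 * Registering a second buffer backed by the same huge page accounts 0 extra
 * pages, because headpage_already_acct() finds its compound head among the
 * already registered bvecs.
 */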
686 
687 static bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
688 				struct io_imu_folio_data *data)
689 {
690 	struct page **page_array = *pages, **new_array = NULL;
691 	unsigned nr_pages_left = *nr_pages;
692 	unsigned nr_folios = data->nr_folios;
693 	unsigned i, j;
694 
695 	/* Store head pages only */
696 	new_array = kvmalloc_array(nr_folios, sizeof(struct page *), GFP_KERNEL);
697 	if (!new_array)
698 		return false;
699 
700 	for (i = 0, j = 0; i < nr_folios; i++) {
701 		struct page *p = compound_head(page_array[j]);
702 		struct folio *folio = page_folio(p);
703 		unsigned int nr;
704 
705 		WARN_ON_ONCE(i > 0 && p != page_array[j]);
706 
707 		nr = i ? data->nr_pages_mid : data->nr_pages_head;
708 		nr = min(nr, nr_pages_left);
709 		/* Drop all but one ref; the entire folio will remain pinned. */
710 		if (nr > 1)
711 			unpin_user_folio(folio, nr - 1);
712 		j += nr;
713 		nr_pages_left -= nr;
714 		new_array[i] = p;
715 	}
716 
717 	WARN_ON_ONCE(j != *nr_pages);
718 
719 	kvfree(page_array);
720 	*pages = new_array;
721 	*nr_pages = nr_folios;
722 	return true;
723 }
724 
725 bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
726 			      struct io_imu_folio_data *data)
727 {
728 	struct folio *folio = page_folio(page_array[0]);
729 	unsigned int count = 1, nr_folios = 1;
730 	int i;
731 
732 	data->nr_pages_mid = folio_nr_pages(folio);
733 	data->folio_shift = folio_shift(folio);
734 
735 	/*
736 	 * Check if pages are contiguous inside a folio, and all folios have
737 	 * the same page count except for the head and tail.
738 	 */
739 	for (i = 1; i < nr_pages; i++) {
740 		if (page_folio(page_array[i]) == folio &&
741 			page_array[i] == page_array[i-1] + 1) {
742 			count++;
743 			continue;
744 		}
745 
746 		if (nr_folios == 1) {
747 			if (folio_page_idx(folio, page_array[i-1]) !=
748 				data->nr_pages_mid - 1)
749 				return false;
750 
751 			data->nr_pages_head = count;
752 		} else if (count != data->nr_pages_mid) {
753 			return false;
754 		}
755 
756 		folio = page_folio(page_array[i]);
757 		if (folio_size(folio) != (1UL << data->folio_shift) ||
758 			folio_page_idx(folio, page_array[i]) != 0)
759 			return false;
760 
761 		count = 1;
762 		nr_folios++;
763 	}
764 	if (nr_folios == 1)
765 		data->nr_pages_head = count;
766 
767 	data->nr_folios = nr_folios;
768 	return true;
769 }
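/*
 * Worked example (illustrative, assuming 2 MiB THP folios and 4 KiB pages):
 * a folio-aligned 4 MiB buffer pins 1024 small pages. io_check_coalesce_buffer()
 * reports nr_folios = 2, nr_pages_mid = 512 and folio_shift = 21, and
 * io_coalesce_buffer() shrinks the page array to the two folio head pages,
 * so the registration needs 2 bvecs instead of 1024.
 */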
770 
771 static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
772 						   struct iovec *iov,
773 						   struct page **last_hpage)
774 {
775 	struct io_mapped_ubuf *imu = NULL;
776 	struct page **pages = NULL;
777 	struct io_rsrc_node *node;
778 	unsigned long off;
779 	size_t size;
780 	int ret, nr_pages, i;
781 	struct io_imu_folio_data data;
782 	bool coalesced = false;
783 
784 	if (!iov->iov_base)
785 		return NULL;
786 
787 	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
788 	if (!node)
789 		return ERR_PTR(-ENOMEM);
790 
791 	ret = -ENOMEM;
792 	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
793 				&nr_pages);
794 	if (IS_ERR(pages)) {
795 		ret = PTR_ERR(pages);
796 		pages = NULL;
797 		goto done;
798 	}
799 
800 	/* If it's huge page(s), try to coalesce them into fewer bvec entries */
801 	if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) {
802 		if (data.nr_pages_mid != 1)
803 			coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
804 	}
805 
806 	imu = io_alloc_imu(ctx, nr_pages);
807 	if (!imu)
808 		goto done;
809 
810 	imu->nr_bvecs = nr_pages;
811 	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
812 	if (ret)
813 		goto done;
814 
815 	size = iov->iov_len;
816 	/* store original address for later verification */
817 	imu->ubuf = (unsigned long) iov->iov_base;
818 	imu->len = iov->iov_len;
819 	imu->folio_shift = PAGE_SHIFT;
820 	imu->release = io_release_ubuf;
821 	imu->priv = imu;
822 	imu->is_kbuf = false;
823 	imu->dir = IO_IMU_DEST | IO_IMU_SOURCE;
824 	if (coalesced)
825 		imu->folio_shift = data.folio_shift;
826 	refcount_set(&imu->refs, 1);
827 	off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1);
828 	node->buf = imu;
829 	ret = 0;
830 
831 	for (i = 0; i < nr_pages; i++) {
832 		size_t vec_len;
833 
834 		vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
835 		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
836 		off = 0;
837 		size -= vec_len;
838 	}
839 done:
840 	if (ret) {
841 		if (imu)
842 			io_free_imu(ctx, imu);
843 		if (pages)
844 			unpin_user_pages(pages, nr_pages);
845 		io_cache_free(&ctx->node_cache, node);
846 		node = ERR_PTR(ret);
847 	}
848 	kvfree(pages);
849 	return node;
850 }
851 
852 int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
853 			    unsigned int nr_args, u64 __user *tags)
854 {
855 	struct page *last_hpage = NULL;
856 	struct io_rsrc_data data;
857 	struct iovec fast_iov, *iov = &fast_iov;
858 	const struct iovec __user *uvec;
859 	int i, ret;
860 
861 	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
862 
863 	if (ctx->buf_table.nr)
864 		return -EBUSY;
865 	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
866 		return -EINVAL;
867 	ret = io_rsrc_data_alloc(&data, nr_args);
868 	if (ret)
869 		return ret;
870 
871 	if (!arg)
872 		memset(iov, 0, sizeof(*iov));
873 
874 	for (i = 0; i < nr_args; i++) {
875 		struct io_rsrc_node *node;
876 		u64 tag = 0;
877 
878 		if (arg) {
879 			uvec = (struct iovec __user *) arg;
880 			iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
881 			if (IS_ERR(iov)) {
882 				ret = PTR_ERR(iov);
883 				break;
884 			}
885 			ret = io_buffer_validate(iov);
886 			if (ret)
887 				break;
888 			if (ctx->compat)
889 				arg += sizeof(struct compat_iovec);
890 			else
891 				arg += sizeof(struct iovec);
892 		}
893 
894 		if (tags) {
895 			if (copy_from_user(&tag, &tags[i], sizeof(tag))) {
896 				ret = -EFAULT;
897 				break;
898 			}
899 		}
900 
901 		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
902 		if (IS_ERR(node)) {
903 			ret = PTR_ERR(node);
904 			break;
905 		}
906 		if (tag) {
907 			if (!node) {
908 				ret = -EINVAL;
909 				break;
910 			}
911 			node->tag = tag;
912 		}
913 		data.nodes[i] = node;
914 	}
915 
916 	ctx->buf_table = data;
917 	if (ret) {
918 		io_clear_table_tags(&ctx->buf_table);
919 		io_sqe_buffers_unregister(ctx);
920 	}
921 	return ret;
922 }
923 
924 int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
925 			    void (*release)(void *), unsigned int index,
926 			    unsigned int issue_flags)
927 {
928 	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
929 	struct io_rsrc_data *data = &ctx->buf_table;
930 	struct req_iterator rq_iter;
931 	struct io_mapped_ubuf *imu;
932 	struct io_rsrc_node *node;
933 	struct bio_vec bv, *bvec;
934 	u16 nr_bvecs;
935 	int ret = 0;
936 
937 	io_ring_submit_lock(ctx, issue_flags);
938 	if (index >= data->nr) {
939 		ret = -EINVAL;
940 		goto unlock;
941 	}
942 	index = array_index_nospec(index, data->nr);
943 
944 	if (data->nodes[index]) {
945 		ret = -EBUSY;
946 		goto unlock;
947 	}
948 
949 	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
950 	if (!node) {
951 		ret = -ENOMEM;
952 		goto unlock;
953 	}
954 
955 	nr_bvecs = blk_rq_nr_phys_segments(rq);
956 	imu = io_alloc_imu(ctx, nr_bvecs);
957 	if (!imu) {
958 		kfree(node);
959 		ret = -ENOMEM;
960 		goto unlock;
961 	}
962 
963 	imu->ubuf = 0;
964 	imu->len = blk_rq_bytes(rq);
965 	imu->acct_pages = 0;
966 	imu->folio_shift = PAGE_SHIFT;
967 	imu->nr_bvecs = nr_bvecs;
968 	refcount_set(&imu->refs, 1);
969 	imu->release = release;
970 	imu->priv = rq;
971 	imu->is_kbuf = true;
972 	imu->dir = 1 << rq_data_dir(rq);
973 
974 	bvec = imu->bvec;
975 	rq_for_each_bvec(bv, rq, rq_iter)
976 		*bvec++ = bv;
977 
978 	node->buf = imu;
979 	data->nodes[index] = node;
980 unlock:
981 	io_ring_submit_unlock(ctx, issue_flags);
982 	return ret;
983 }
984 EXPORT_SYMBOL_GPL(io_buffer_register_bvec);
985 
986 int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
987 			      unsigned int issue_flags)
988 {
989 	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
990 	struct io_rsrc_data *data = &ctx->buf_table;
991 	struct io_rsrc_node *node;
992 	int ret = 0;
993 
994 	io_ring_submit_lock(ctx, issue_flags);
995 	if (index >= data->nr) {
996 		ret = -EINVAL;
997 		goto unlock;
998 	}
999 	index = array_index_nospec(index, data->nr);
1000 
1001 	node = data->nodes[index];
1002 	if (!node) {
1003 		ret = -EINVAL;
1004 		goto unlock;
1005 	}
1006 	if (!node->buf->is_kbuf) {
1007 		ret = -EBUSY;
1008 		goto unlock;
1009 	}
1010 
1011 	io_put_rsrc_node(ctx, node);
1012 	data->nodes[index] = NULL;
1013 unlock:
1014 	io_ring_submit_unlock(ctx, issue_flags);
1015 	return ret;
1016 }
1017 EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec);
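/*
 * Driver-side sketch (the my_drv_* names are hypothetical; only
 * io_buffer_register_bvec() and io_buffer_unregister_bvec() are the real
 * exports above): a uring_cmd driver can expose the bio_vecs of a request
 * as a fixed buffer in a previously registered sparse table slot, then drop
 * it again when the command completes.
 *
 *	static void my_drv_release(void *priv)
 *	{
 *		struct request *rq = priv;	// matches imu->priv set above
 *
 *		my_drv_complete_rq(rq);		// hypothetical completion hook
 *	}
 *
 *	ret = io_buffer_register_bvec(cmd, rq, my_drv_release, index, issue_flags);
 *	...
 *	io_buffer_unregister_bvec(cmd, index, issue_flags);
 */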
1018 
1019 static int validate_fixed_range(u64 buf_addr, size_t len,
1020 				const struct io_mapped_ubuf *imu)
1021 {
1022 	u64 buf_end;
1023 
1024 	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
1025 		return -EFAULT;
1026 	/* not inside the mapped region */
1027 	if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
1028 		return -EFAULT;
1029 	if (unlikely(len > MAX_RW_COUNT))
1030 		return -EFAULT;
1031 	return 0;
1032 }
1033 
1034 static int io_import_kbuf(int ddir, struct iov_iter *iter,
1035 			  struct io_mapped_ubuf *imu, size_t len, size_t offset)
1036 {
1037 	size_t count = len + offset;
1038 
1039 	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count);
1040 	iov_iter_advance(iter, offset);
1041 
1042 	if (count < imu->len) {
1043 		const struct bio_vec *bvec = iter->bvec;
1044 
1045 		while (len > bvec->bv_len) {
1046 			len -= bvec->bv_len;
1047 			bvec++;
1048 		}
1049 		iter->nr_segs = 1 + bvec - iter->bvec;
1050 	}
1051 	return 0;
1052 }
1053 
1054 static int io_import_fixed(int ddir, struct iov_iter *iter,
1055 			   struct io_mapped_ubuf *imu,
1056 			   u64 buf_addr, size_t len)
1057 {
1058 	const struct bio_vec *bvec;
1059 	size_t folio_mask;
1060 	unsigned nr_segs;
1061 	size_t offset;
1062 	int ret;
1063 
1064 	ret = validate_fixed_range(buf_addr, len, imu);
1065 	if (unlikely(ret))
1066 		return ret;
1067 	if (!(imu->dir & (1 << ddir)))
1068 		return -EFAULT;
1069 
1070 	offset = buf_addr - imu->ubuf;
1071 
1072 	if (imu->is_kbuf)
1073 		return io_import_kbuf(ddir, iter, imu, len, offset);
1074 
1075 	/*
1076 	 * Don't use iov_iter_advance() here, as it's really slow when
1077 	 * using the latter parts of a big fixed buffer - it iterates
1078 	 * over each segment manually. We can cheat a bit here for user
1079 	 * registered nodes, because we know that:
1080 	 *
1081 	 * 1) it's a BVEC iter, we set it up
1082 	 * 2) all bvecs are the same size, except potentially the
1083 	 *    first and last bvec
1084 	 */
1085 	folio_mask = (1UL << imu->folio_shift) - 1;
1086 	bvec = imu->bvec;
1087 	if (offset >= bvec->bv_len) {
1088 		unsigned long seg_skip;
1089 
1090 		/* skip first vec */
1091 		offset -= bvec->bv_len;
1092 		seg_skip = 1 + (offset >> imu->folio_shift);
1093 		bvec += seg_skip;
1094 		offset &= folio_mask;
1095 	}
1096 	nr_segs = (offset + len + bvec->bv_offset + folio_mask) >> imu->folio_shift;
1097 	iov_iter_bvec(iter, ddir, bvec, nr_segs, len);
1098 	iter->iov_offset = offset;
1099 	return 0;
1100 }
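/*
 * Worked example of the segment math above (illustrative, assuming a
 * page-aligned registration so every bvec spans a full 4 KiB page and
 * folio_shift = 12): with imu->ubuf = 0x100000, buf_addr = 0x103800 and
 * len = 0x1000, offset starts at 0x3800. The first bvec is skipped
 * (offset becomes 0x2800), seg_skip = 1 + (0x2800 >> 12) = 3, so the iter
 * starts at bvec[3] with iov_offset = 0x800, and
 * nr_segs = (0x800 + 0x1000 + 0xfff) >> 12 = 2.
 */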
1101 
1102 inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req,
1103 					     unsigned issue_flags)
1104 {
1105 	struct io_ring_ctx *ctx = req->ctx;
1106 	struct io_rsrc_node *node;
1107 
1108 	if (req->flags & REQ_F_BUF_NODE)
1109 		return req->buf_node;
1110 	req->flags |= REQ_F_BUF_NODE;
1111 
1112 	io_ring_submit_lock(ctx, issue_flags);
1113 	node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
1114 	if (node) {
1115 		node->refs++;
1116 		req->buf_node = node;
1117 		io_ring_submit_unlock(ctx, issue_flags);
1118 		return node;
1119 	}
1120 	req->flags &= ~REQ_F_BUF_NODE;
1121 	io_ring_submit_unlock(ctx, issue_flags);
1122 	return NULL;
1123 }
1124 
1125 int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,
1126 			u64 buf_addr, size_t len, int ddir,
1127 			unsigned issue_flags)
1128 {
1129 	struct io_rsrc_node *node;
1130 
1131 	node = io_find_buf_node(req, issue_flags);
1132 	if (!node)
1133 		return -EFAULT;
1134 	return io_import_fixed(ddir, iter, node->buf, buf_addr, len);
1135 }
1136 
1137 /* Lock two rings at once. The rings must be different! */
1138 static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
1139 {
1140 	if (ctx1 > ctx2)
1141 		swap(ctx1, ctx2);
1142 	mutex_lock(&ctx1->uring_lock);
1143 	mutex_lock_nested(&ctx2->uring_lock, SINGLE_DEPTH_NESTING);
1144 }
1145 
1146 /* Both rings are locked by the caller. */
1147 static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
1148 			    struct io_uring_clone_buffers *arg)
1149 {
1150 	struct io_rsrc_data data;
1151 	int i, ret, off, nr;
1152 	unsigned int nbufs;
1153 
1154 	lockdep_assert_held(&ctx->uring_lock);
1155 	lockdep_assert_held(&src_ctx->uring_lock);
1156 
1157 	/*
1158 	 * Accounting state is shared between the two rings; that only works if
1159 	 * both rings are accounted towards the same counters.
1160 	 */
1161 	if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account)
1162 		return -EINVAL;
1163 
1164 	/* if offsets are given, must have nr specified too */
1165 	if (!arg->nr && (arg->dst_off || arg->src_off))
1166 		return -EINVAL;
1167 	/* not allowed unless REPLACE is set */
1168 	if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
1169 		return -EBUSY;
1170 
1171 	nbufs = src_ctx->buf_table.nr;
1172 	if (!arg->nr)
1173 		arg->nr = nbufs;
1174 	else if (arg->nr > nbufs)
1175 		return -EINVAL;
1176 	else if (arg->nr > IORING_MAX_REG_BUFFERS)
1177 		return -EINVAL;
1178 	if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
1179 		return -EOVERFLOW;
1180 	if (nbufs > IORING_MAX_REG_BUFFERS)
1181 		return -EINVAL;
1182 
1183 	ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr));
1184 	if (ret)
1185 		return ret;
1186 
1187 	/* Fill entries in data from dst that won't overlap with src */
1188 	for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
1189 		struct io_rsrc_node *src_node = ctx->buf_table.nodes[i];
1190 
1191 		if (src_node) {
1192 			data.nodes[i] = src_node;
1193 			src_node->refs++;
1194 		}
1195 	}
1196 
1197 	ret = -ENXIO;
1198 	nbufs = src_ctx->buf_table.nr;
1199 	if (!nbufs)
1200 		goto out_free;
1201 	ret = -EINVAL;
1202 	if (!arg->nr)
1203 		arg->nr = nbufs;
1204 	else if (arg->nr > nbufs)
1205 		goto out_free;
1206 	ret = -EOVERFLOW;
1207 	if (check_add_overflow(arg->nr, arg->src_off, &off))
1208 		goto out_free;
1209 	if (off > nbufs)
1210 		goto out_free;
1211 
1212 	off = arg->dst_off;
1213 	i = arg->src_off;
1214 	nr = arg->nr;
1215 	while (nr--) {
1216 		struct io_rsrc_node *dst_node, *src_node;
1217 
1218 		src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i);
1219 		if (!src_node) {
1220 			dst_node = NULL;
1221 		} else {
1222 			dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
1223 			if (!dst_node) {
1224 				ret = -ENOMEM;
1225 				goto out_free;
1226 			}
1227 
1228 			refcount_inc(&src_node->buf->refs);
1229 			dst_node->buf = src_node->buf;
1230 		}
1231 		data.nodes[off++] = dst_node;
1232 		i++;
1233 	}
1234 
1235 	/*
1236 	 * If asked for replace, put the old table. data->nodes[] holds both
1237 	 * old and new nodes at this point.
1238 	 */
1239 	if (arg->flags & IORING_REGISTER_DST_REPLACE)
1240 		io_rsrc_data_free(ctx, &ctx->buf_table);
1241 
1242 	/*
1243 	 * ctx->buf_table must be empty now - either the contents are being
1244 	 * replaced and we just freed the table, or the contents are being
1245 	 * copied to a ring that does not have buffers yet (checked at function
1246 	 * entry).
1247 	 */
1248 	WARN_ON_ONCE(ctx->buf_table.nr);
1249 	ctx->buf_table = data;
1250 	return 0;
1251 
1252 out_free:
1253 	io_rsrc_data_free(ctx, &data);
1254 	return ret;
1255 }
1256 
1257 /*
1258  * Copy the registered buffers from the source ring whose file descriptor
1259  * is given in the src_fd to the current ring. This is identical to registering
1260  * the buffers with ctx, except faster as mappings already exist.
1261  *
1262  * Since the memory is already accounted once, don't account it again.
1263  */
1264 int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
1265 {
1266 	struct io_uring_clone_buffers buf;
1267 	struct io_ring_ctx *src_ctx;
1268 	bool registered_src;
1269 	struct file *file;
1270 	int ret;
1271 
1272 	if (copy_from_user(&buf, arg, sizeof(buf)))
1273 		return -EFAULT;
1274 	if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE))
1275 		return -EINVAL;
1276 	if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr)
1277 		return -EBUSY;
1278 	if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
1279 		return -EINVAL;
1280 
1281 	registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
1282 	file = io_uring_register_get_file(buf.src_fd, registered_src);
1283 	if (IS_ERR(file))
1284 		return PTR_ERR(file);
1285 
1286 	src_ctx = file->private_data;
1287 	if (src_ctx != ctx) {
1288 		mutex_unlock(&ctx->uring_lock);
1289 		lock_two_rings(ctx, src_ctx);
1290 	}
1291 
1292 	ret = io_clone_buffers(ctx, src_ctx, &buf);
1293 
1294 	if (src_ctx != ctx)
1295 		mutex_unlock(&src_ctx->uring_lock);
1296 
1297 	fput(file);
1298 	return ret;
1299 }
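/*
 * Userspace sketch (assumed flow): clone every registered buffer from
 * src_ring_fd into a ring that has no buffer table yet. Leaving nr at 0
 * means "all source buffers"; the final syscall argument is 1, as expected
 * by register.c (not shown here) for this opcode.
 *
 *	struct io_uring_clone_buffers buf = {
 *		.src_fd	= src_ring_fd,
 *	};
 *	syscall(__NR_io_uring_register, dst_ring_fd,
 *		IORING_REGISTER_CLONE_BUFFERS, &buf, 1);
 */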
1300 
1301 void io_vec_free(struct iou_vec *iv)
1302 {
1303 	if (!iv->iovec)
1304 		return;
1305 	kfree(iv->iovec);
1306 	iv->iovec = NULL;
1307 	iv->nr = 0;
1308 }
1309 
1310 int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries)
1311 {
1312 	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
1313 	struct iovec *iov;
1314 
1315 	iov = kmalloc_array(nr_entries, sizeof(iov[0]), gfp);
1316 	if (!iov)
1317 		return -ENOMEM;
1318 
1319 	io_vec_free(iv);
1320 	iv->iovec = iov;
1321 	iv->nr = nr_entries;
1322 	return 0;
1323 }
1324 
1325 static int io_vec_fill_bvec(int ddir, struct iov_iter *iter,
1326 				struct io_mapped_ubuf *imu,
1327 				struct iovec *iovec, unsigned nr_iovs,
1328 				struct iou_vec *vec)
1329 {
1330 	unsigned long folio_size = 1 << imu->folio_shift;
1331 	unsigned long folio_mask = folio_size - 1;
1332 	u64 folio_addr = imu->ubuf & ~folio_mask;
1333 	struct bio_vec *res_bvec = vec->bvec;
1334 	size_t total_len = 0;
1335 	unsigned bvec_idx = 0;
1336 	unsigned iov_idx;
1337 
1338 	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
1339 		size_t iov_len = iovec[iov_idx].iov_len;
1340 		u64 buf_addr = (u64)(uintptr_t)iovec[iov_idx].iov_base;
1341 		struct bio_vec *src_bvec;
1342 		size_t offset;
1343 		int ret;
1344 
1345 		ret = validate_fixed_range(buf_addr, iov_len, imu);
1346 		if (unlikely(ret))
1347 			return ret;
1348 
1349 		if (unlikely(!iov_len))
1350 			return -EFAULT;
1351 		if (unlikely(check_add_overflow(total_len, iov_len, &total_len)))
1352 			return -EOVERFLOW;
1353 
1354 		/* using the folio address here also accounts for the bvec offset */
1355 		offset = buf_addr - folio_addr;
1356 		src_bvec = imu->bvec + (offset >> imu->folio_shift);
1357 		offset &= folio_mask;
1358 
1359 		for (; iov_len; offset = 0, bvec_idx++, src_bvec++) {
1360 			size_t seg_size = min_t(size_t, iov_len,
1361 						folio_size - offset);
1362 
1363 			bvec_set_page(&res_bvec[bvec_idx],
1364 				      src_bvec->bv_page, seg_size, offset);
1365 			iov_len -= seg_size;
1366 		}
1367 	}
1368 	if (total_len > MAX_RW_COUNT)
1369 		return -EINVAL;
1370 
1371 	iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len);
1372 	return 0;
1373 }
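/*
 * Worked example (illustrative, assuming a page-aligned user buffer and
 * folio_shift = 12): an iovec with iov_base = imu->ubuf + 0xe00 and
 * iov_len = 0x400 crosses one page boundary, so the loop above emits two
 * bvecs: { page 0, len 0x200, offset 0xe00 } and { page 1, len 0x200,
 * offset 0 }.
 */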
1374 
1375 static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs,
1376 				 struct io_mapped_ubuf *imu)
1377 {
1378 	unsigned shift = imu->folio_shift;
1379 	size_t max_segs = 0;
1380 	unsigned i;
1381 
1382 	for (i = 0; i < nr_iovs; i++)
1383 		max_segs += (iov[i].iov_len >> shift) + 2;
1384 	return max_segs;
1385 }
1386 
1387 static int io_vec_fill_kern_bvec(int ddir, struct iov_iter *iter,
1388 				 struct io_mapped_ubuf *imu,
1389 				 struct iovec *iovec, unsigned nr_iovs,
1390 				 struct iou_vec *vec)
1391 {
1392 	const struct bio_vec *src_bvec = imu->bvec;
1393 	struct bio_vec *res_bvec = vec->bvec;
1394 	unsigned res_idx = 0;
1395 	size_t total_len = 0;
1396 	unsigned iov_idx;
1397 
1398 	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
1399 		size_t offset = (size_t)(uintptr_t)iovec[iov_idx].iov_base;
1400 		size_t iov_len = iovec[iov_idx].iov_len;
1401 		struct bvec_iter bi = {
1402 			.bi_size        = offset + iov_len,
1403 		};
1404 		struct bio_vec bv;
1405 
1406 		bvec_iter_advance(src_bvec, &bi, offset);
1407 		for_each_mp_bvec(bv, src_bvec, bi, bi)
1408 			res_bvec[res_idx++] = bv;
1409 		total_len += iov_len;
1410 	}
1411 	iov_iter_bvec(iter, ddir, res_bvec, res_idx, total_len);
1412 	return 0;
1413 }
1414 
1415 static int iov_kern_bvec_size(const struct iovec *iov,
1416 			      const struct io_mapped_ubuf *imu,
1417 			      unsigned int *nr_seg)
1418 {
1419 	size_t offset = (size_t)(uintptr_t)iov->iov_base;
1420 	const struct bio_vec *bvec = imu->bvec;
1421 	int start = 0, i = 0;
1422 	size_t off = 0;
1423 	int ret;
1424 
1425 	ret = validate_fixed_range(offset, iov->iov_len, imu);
1426 	if (unlikely(ret))
1427 		return ret;
1428 
1429 	for (i = 0; off < offset + iov->iov_len && i < imu->nr_bvecs;
1430 			off += bvec[i].bv_len, i++) {
1431 		if (offset >= off && offset < off + bvec[i].bv_len)
1432 			start = i;
1433 	}
1434 	*nr_seg = i - start;
1435 	return 0;
1436 }
1437 
1438 static int io_kern_bvec_size(struct iovec *iov, unsigned nr_iovs,
1439 			     struct io_mapped_ubuf *imu, unsigned *nr_segs)
1440 {
1441 	unsigned max_segs = 0;
1442 	size_t total_len = 0;
1443 	unsigned i;
1444 	int ret;
1445 
1446 	*nr_segs = 0;
1447 	for (i = 0; i < nr_iovs; i++) {
1448 		if (unlikely(!iov[i].iov_len))
1449 			return -EFAULT;
1450 		if (unlikely(check_add_overflow(total_len, iov[i].iov_len,
1451 						&total_len)))
1452 			return -EOVERFLOW;
1453 		ret = iov_kern_bvec_size(&iov[i], imu, &max_segs);
1454 		if (unlikely(ret))
1455 			return ret;
1456 		*nr_segs += max_segs;
1457 	}
1458 	if (total_len > MAX_RW_COUNT)
1459 		return -EINVAL;
1460 	return 0;
1461 }
1462 
1463 int io_import_reg_vec(int ddir, struct iov_iter *iter,
1464 			struct io_kiocb *req, struct iou_vec *vec,
1465 			unsigned nr_iovs, unsigned issue_flags)
1466 {
1467 	struct io_rsrc_node *node;
1468 	struct io_mapped_ubuf *imu;
1469 	unsigned iovec_off;
1470 	struct iovec *iov;
1471 	unsigned nr_segs;
1472 
1473 	node = io_find_buf_node(req, issue_flags);
1474 	if (!node)
1475 		return -EFAULT;
1476 	imu = node->buf;
1477 	if (!(imu->dir & (1 << ddir)))
1478 		return -EFAULT;
1479 
1480 	iovec_off = vec->nr - nr_iovs;
1481 	iov = vec->iovec + iovec_off;
1482 
1483 	if (imu->is_kbuf) {
1484 		int ret = io_kern_bvec_size(iov, nr_iovs, imu, &nr_segs);
1485 
1486 		if (unlikely(ret))
1487 			return ret;
1488 	} else {
1489 		nr_segs = io_estimate_bvec_size(iov, nr_iovs, imu);
1490 	}
1491 
1492 	if (sizeof(struct bio_vec) > sizeof(struct iovec)) {
1493 		size_t bvec_bytes;
1494 
1495 		bvec_bytes = nr_segs * sizeof(struct bio_vec);
1496 		nr_segs = (bvec_bytes + sizeof(*iov) - 1) / sizeof(*iov);
1497 		nr_segs += nr_iovs;
1498 	}
1499 
1500 	if (nr_segs > vec->nr) {
1501 		struct iou_vec tmp_vec = {};
1502 		int ret;
1503 
1504 		ret = io_vec_realloc(&tmp_vec, nr_segs);
1505 		if (ret)
1506 			return ret;
1507 
1508 		iovec_off = tmp_vec.nr - nr_iovs;
1509 		memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs);
1510 		io_vec_free(vec);
1511 
1512 		*vec = tmp_vec;
1513 		iov = vec->iovec + iovec_off;
1514 		req->flags |= REQ_F_NEED_CLEANUP;
1515 	}
1516 
1517 	if (imu->is_kbuf)
1518 		return io_vec_fill_kern_bvec(ddir, iter, imu, iov, nr_iovs, vec);
1519 
1520 	return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec);
1521 }
1522 
1523 int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv,
1524 		      const struct iovec __user *uvec, size_t uvec_segs)
1525 {
1526 	struct iovec *iov;
1527 	int iovec_off, ret;
1528 	void *res;
1529 
1530 	if (uvec_segs > iv->nr) {
1531 		ret = io_vec_realloc(iv, uvec_segs);
1532 		if (ret)
1533 			return ret;
1534 		req->flags |= REQ_F_NEED_CLEANUP;
1535 	}
1536 
1537 	/* pad iovec to the right */
1538 	iovec_off = iv->nr - uvec_segs;
1539 	iov = iv->iovec + iovec_off;
1540 	res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov,
1541 			      io_is_compat(req->ctx));
1542 	if (IS_ERR(res))
1543 		return PTR_ERR(res);
1544 
1545 	req->flags |= REQ_F_IMPORT_BUFFER;
1546 	return 0;
1547 }
1548