xref: /linux/io_uring/rsrc.c (revision ca220141fa8ebae09765a242076b2b77338106b0)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/kernel.h>
3 #include <linux/errno.h>
4 #include <linux/fs.h>
5 #include <linux/file.h>
6 #include <linux/mm.h>
7 #include <linux/slab.h>
8 #include <linux/nospec.h>
9 #include <linux/hugetlb.h>
10 #include <linux/compat.h>
11 #include <linux/io_uring.h>
12 #include <linux/io_uring/cmd.h>
13 
14 #include <uapi/linux/io_uring.h>
15 
16 #include "filetable.h"
17 #include "io_uring.h"
18 #include "openclose.h"
19 #include "rsrc.h"
20 #include "memmap.h"
21 #include "register.h"
22 
/* Per-request state for IORING_OP_FILES_UPDATE (see io_files_update_prep()). */
struct io_rsrc_update {
	struct file			*file;
	u64				arg;		/* user pointer to the fd array */
	u32				nr_args;	/* number of entries to process */
	u32				offset;		/* first table slot to update */
};
29 
30 static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
31 			struct iovec *iov, struct page **last_hpage);
32 
33 /* only define max */
34 #define IORING_MAX_FIXED_FILES	(1U << 20)
35 #define IORING_MAX_REG_BUFFERS	(1U << 14)
36 
37 #define IO_CACHED_BVECS_SEGS	32
38 
/*
 * Charge @nr_pages against @user's RLIMIT_MEMLOCK budget.
 *
 * Returns 0 on success, -ENOMEM if the charge would exceed the limit.
 * locked_vm is updated with a try_cmpxchg loop so concurrent chargers
 * can never collectively overshoot the limit.
 */
int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
		/* on cmpxchg failure cur_pages is refreshed and we retry */
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
	return 0;
}
58 
59 void io_unaccount_mem(struct user_struct *user, struct mm_struct *mm_account,
60 		      unsigned long nr_pages)
61 {
62 	if (user)
63 		__io_unaccount_mem(user, nr_pages);
64 
65 	if (mm_account)
66 		atomic64_sub(nr_pages, &mm_account->pinned_vm);
67 }
68 
69 int io_account_mem(struct user_struct *user, struct mm_struct *mm_account,
70 		   unsigned long nr_pages)
71 {
72 	int ret;
73 
74 	if (user) {
75 		ret = __io_account_mem(user, nr_pages);
76 		if (ret)
77 			return ret;
78 	}
79 
80 	if (mm_account)
81 		atomic64_add(nr_pages, &mm_account->pinned_vm);
82 
83 	return 0;
84 }
85 
86 int io_validate_user_buf_range(u64 uaddr, u64 ulen)
87 {
88 	unsigned long tmp, base = (unsigned long)uaddr;
89 	unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen);
90 
91 	/* arbitrary limit, but we need something */
92 	if (ulen > SZ_1G || !ulen)
93 		return -EFAULT;
94 	if (check_add_overflow(base, acct_len, &tmp))
95 		return -EOVERFLOW;
96 	return 0;
97 }
98 
99 static void io_release_ubuf(void *priv)
100 {
101 	struct io_mapped_ubuf *imu = priv;
102 	unsigned int i;
103 
104 	for (i = 0; i < imu->nr_bvecs; i++) {
105 		struct folio *folio = page_folio(imu->bvec[i].bv_page);
106 
107 		unpin_user_folio(folio, 1);
108 	}
109 }
110 
111 static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
112 					   int nr_bvecs)
113 {
114 	if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
115 		return io_cache_alloc(&ctx->imu_cache, GFP_KERNEL);
116 	return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs),
117 			GFP_KERNEL);
118 }
119 
120 static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
121 {
122 	if (imu->nr_bvecs <= IO_CACHED_BVECS_SEGS)
123 		io_cache_free(&ctx->imu_cache, imu);
124 	else
125 		kvfree(imu);
126 }
127 
/*
 * Drop a reference to @imu and, on the final put, unaccount any charged
 * memory, release the pages via the registered ->release() callback and
 * free the imu itself.
 */
static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
	/*
	 * A single reference is the common case; only pay for the atomic
	 * dec-and-test when the buffer has actually been shared.
	 */
	if (unlikely(refcount_read(&imu->refs) > 1)) {
		if (!refcount_dec_and_test(&imu->refs))
			return;
	}

	if (imu->acct_pages)
		io_unaccount_mem(ctx->user, ctx->mm_account, imu->acct_pages);
	imu->release(imu->priv);
	io_free_imu(ctx, imu);
}
140 
141 struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
142 {
143 	struct io_rsrc_node *node;
144 
145 	node = io_cache_alloc(&ctx->node_cache, GFP_KERNEL);
146 	if (node) {
147 		node->type = type;
148 		node->refs = 1;
149 		node->tag = 0;
150 		node->file_ptr = 0;
151 	}
152 	return node;
153 }
154 
155 bool io_rsrc_cache_init(struct io_ring_ctx *ctx)
156 {
157 	const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec,
158 						 IO_CACHED_BVECS_SEGS);
159 	const int node_size = sizeof(struct io_rsrc_node);
160 	bool ret;
161 
162 	ret = io_alloc_cache_init(&ctx->node_cache, IO_ALLOC_CACHE_MAX,
163 				  node_size, 0);
164 	ret |= io_alloc_cache_init(&ctx->imu_cache, IO_ALLOC_CACHE_MAX,
165 				   imu_cache_size, 0);
166 	return ret;
167 }
168 
/* Tear down both per-ring resource caches; cached entries are kfree'd. */
void io_rsrc_cache_free(struct io_ring_ctx *ctx)
{
	io_alloc_cache_free(&ctx->node_cache, kfree);
	io_alloc_cache_free(&ctx->imu_cache, kfree);
}
174 
175 static void io_clear_table_tags(struct io_rsrc_data *data)
176 {
177 	int i;
178 
179 	for (i = 0; i < data->nr; i++) {
180 		struct io_rsrc_node *node = data->nodes[i];
181 
182 		if (node)
183 			node->tag = 0;
184 	}
185 }
186 
187 __cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
188 			      struct io_rsrc_data *data)
189 {
190 	if (!data->nr)
191 		return;
192 	while (data->nr--) {
193 		if (data->nodes[data->nr])
194 			io_put_rsrc_node(ctx, data->nodes[data->nr]);
195 	}
196 	kvfree(data->nodes);
197 	data->nodes = NULL;
198 	data->nr = 0;
199 }
200 
201 __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
202 {
203 	data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *),
204 					GFP_KERNEL_ACCOUNT | __GFP_ZERO);
205 	if (data->nodes) {
206 		data->nr = nr;
207 		return 0;
208 	}
209 	return -ENOMEM;
210 }
211 
/*
 * Core of the registered-file update path: for each of @nr_args entries
 * starting at up->offset, replace or clear the registered file, with an
 * optional per-slot tag that is posted as a CQE when the displaced node
 * is released. fd == -1 clears the slot; IORING_REGISTER_FILES_SKIP
 * leaves it untouched. Returns the number of entries processed if any
 * were, otherwise the first error encountered.
 */
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	int fd, i, err = 0;
	unsigned int done;

	if (!ctx->file_table.data.nr)
		return -ENXIO;
	/* offset + nr_args was already overflow-checked by the caller */
	if (up->offset + nr_args > ctx->file_table.data.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		/* tags only make sense for slots that end up occupied */
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = up->offset + done;
		/* drop whatever occupied the slot before, if anything */
		if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i))
			io_file_bitmap_clear(&ctx->file_table, i);

		if (fd != -1) {
			struct file *file = fget(fd);
			struct io_rsrc_node *node;

			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
			if (!node) {
				err = -ENOMEM;
				fput(file);
				break;
			}
			ctx->file_table.data.nodes[i] = node;
			if (tag)
				node->tag = tag;
			io_fixed_file_set(node, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}
	/* a partial update reports how many entries were applied */
	return done ? done : err;
}
276 
/*
 * Core of the registered-buffer update path: for each of @nr_args
 * iovecs starting at up->offset, pin and register a replacement buffer
 * (or clear the slot when iov_base is NULL), with an optional per-slot
 * tag posted as a CQE when the displaced node is released. Returns the
 * number of entries processed if any were, else the first error.
 */
static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec fast_iov, *iov;
	struct page *last_hpage = NULL;
	struct iovec __user *uvec;
	u64 user_data = up->data;
	__u32 done;
	int i, err;

	if (!ctx->buf_table.nr)
		return -ENXIO;
	/* offset + nr_args was already overflow-checked by the caller */
	if (up->offset + nr_args > ctx->buf_table.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		uvec = u64_to_user_ptr(user_data);
		iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
		if (IS_ERR(iov)) {
			err = PTR_ERR(iov);
			break;
		}
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			err = PTR_ERR(node);
			break;
		}
		/* a NULL node means the slot is being cleared; no tag allowed */
		if (tag) {
			if (!node) {
				err = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
		io_reset_rsrc_node(ctx, &ctx->buf_table, i);
		ctx->buf_table.nodes[i] = node;
		/* compat tasks pass an array of 32-bit iovecs */
		if (ctx->compat)
			user_data += sizeof(struct compat_iovec);
		else
			user_data += sizeof(struct iovec);
	}
	return done ? done : err;
}
330 
331 static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
332 				     struct io_uring_rsrc_update2 *up,
333 				     unsigned nr_args)
334 {
335 	__u32 tmp;
336 
337 	lockdep_assert_held(&ctx->uring_lock);
338 
339 	if (check_add_overflow(up->offset, nr_args, &tmp))
340 		return -EOVERFLOW;
341 
342 	switch (type) {
343 	case IORING_RSRC_FILE:
344 		return __io_sqe_files_update(ctx, up, nr_args);
345 	case IORING_RSRC_BUFFER:
346 		return __io_sqe_buffers_update(ctx, up, nr_args);
347 	}
348 	return -EINVAL;
349 }
350 
351 int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
352 			     unsigned nr_args)
353 {
354 	struct io_uring_rsrc_update2 up;
355 
356 	if (!nr_args)
357 		return -EINVAL;
358 	memset(&up, 0, sizeof(up));
359 	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
360 		return -EFAULT;
361 	if (up.resv || up.resv2)
362 		return -EINVAL;
363 	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
364 }
365 
366 int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
367 			    unsigned size, unsigned type)
368 {
369 	struct io_uring_rsrc_update2 up;
370 
371 	if (size != sizeof(up))
372 		return -EINVAL;
373 	if (copy_from_user(&up, arg, sizeof(up)))
374 		return -EFAULT;
375 	if (!up.nr || up.resv || up.resv2)
376 		return -EINVAL;
377 	return __io_register_rsrc_update(ctx, type, &up, up.nr);
378 }
379 
/*
 * IORING_REGISTER_FILES2 / IORING_REGISTER_BUFFERS2: extended resource
 * registration supporting per-slot tags and sparse tables. Returns 0 on
 * success, or a negative error.
 */
__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		/* SPARSE means no initial data; break falls out to -EINVAL */
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		/* same rule for buffers: SPARSE and a data pointer conflict */
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}
411 
/*
 * Prep handler for IORING_OP_FILES_UPDATE: pull the slot offset, entry
 * count and the user fd-array pointer out of the SQE. Unused SQE fields
 * must be zero, and the opcode takes neither a fixed file nor a
 * provided buffer.
 */
int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	return 0;
}
428 
/*
 * FILES_UPDATE with offset == IORING_FILE_INDEX_ALLOC: install each
 * user-supplied fd into a kernel-chosen free slot and write the chosen
 * index back into the user's fd array. If the write-back fails, the
 * just-installed file is closed again. Returns how many fds were
 * installed, or an error if none were.
 */
static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_table.data.nr)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (get_user(fd, &fds[done])) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		/*
		 * io_fixed_fd_install() owns the file reference from here
		 * (presumably dropping it on failure — defined elsewhere).
		 */
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (put_user(ret, &fds[done])) {
			/* user can't learn the slot; undo the install */
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}
467 
468 int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
469 {
470 	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
471 	struct io_ring_ctx *ctx = req->ctx;
472 	struct io_uring_rsrc_update2 up2;
473 	int ret;
474 
475 	up2.offset = up->offset;
476 	up2.data = up->arg;
477 	up2.nr = 0;
478 	up2.tags = 0;
479 	up2.resv = 0;
480 	up2.resv2 = 0;
481 
482 	if (up->offset == IORING_FILE_INDEX_ALLOC) {
483 		ret = io_files_update_with_index_alloc(req, issue_flags);
484 	} else {
485 		io_ring_submit_lock(ctx, issue_flags);
486 		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
487 						&up2, up->nr_args);
488 		io_ring_submit_unlock(ctx, issue_flags);
489 	}
490 
491 	if (ret < 0)
492 		req_set_fail(req);
493 	io_req_set_res(req, ret, 0);
494 	return IOU_COMPLETE;
495 }
496 
/*
 * Final teardown of a resource node once its last reference is gone:
 * post the user-provided tag CQE (if any), drop the underlying file or
 * buffer reference, and recycle the node back into the cache.
 */
void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
	if (node->tag)
		io_post_aux_cqe(ctx, node->tag, 0, 0);

	switch (node->type) {
	case IORING_RSRC_FILE:
		fput(io_slot_file(node));
		break;
	case IORING_RSRC_BUFFER:
		io_buffer_unmap(ctx, node->buf);
		break;
	default:
		/* corrupted node type; nothing sensible to release */
		WARN_ON_ONCE(1);
		break;
	}

	io_cache_free(&ctx->node_cache, node);
}
516 
/*
 * IORING_UNREGISTER_FILES: drop the entire registered file table and
 * reset the slot-allocation range. Returns -ENXIO if nothing is
 * registered.
 */
int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	if (!ctx->file_table.data.nr)
		return -ENXIO;

	io_free_file_tables(ctx, &ctx->file_table);
	io_file_table_set_alloc_range(ctx, 0, 0);
	return 0;
}
526 
/*
 * IORING_REGISTER_FILES(2): build the registered file table from the
 * user array of @nr_args fds, with optional per-slot @tags. A NULL
 * @fds array or fd == -1 leaves a sparse (empty) slot, which may not
 * carry a tag. On any failure the whole table is torn down again (with
 * tags cleared first so no spurious tag CQEs are posted).
 */
int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_table.data.nr)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args))
		return -ENOMEM;

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		ret = -EFAULT;
		if (tags && copy_from_user(&tag, &tags[i], sizeof(tag)))
			goto fail;
		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd)))
			goto fail;
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			/* an empty slot must not have a tag attached */
			if (tag)
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		ret = -ENOMEM;
		node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
		if (!node) {
			fput(file);
			goto fail;
		}
		if (tag)
			node->tag = tag;
		ctx->file_table.data.nodes[i] = node;
		io_fixed_file_set(node, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
	return 0;
fail:
	io_clear_table_tags(&ctx->file_table.data);
	io_sqe_files_unregister(ctx);
	return ret;
}
596 
/*
 * IORING_UNREGISTER_BUFFERS: drop the whole registered buffer table.
 * Returns -ENXIO if no buffers are registered.
 */
int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	if (!ctx->buf_table.nr)
		return -ENXIO;
	io_rsrc_data_free(ctx, &ctx->buf_table);
	return 0;
}
604 
/*
 * Not super efficient, but this is just a registration time. And we do cache
 * the last compound head, so generally we'll only do a full search if we don't
 * match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 *
 * Returns true if @hpage is already covered by an earlier page in @pages
 * or by any previously registered buffer, false otherwise.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->buf_table.nr; i++) {
		struct io_rsrc_node *node = ctx->buf_table.nodes[i];
		struct io_mapped_ubuf *imu;

		/* sparse table entries have no buffer to compare against */
		if (!node)
			continue;
		imu = node->buf;
		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}
645 
/*
 * Work out how many pages the pinned range actually costs and charge
 * them. Normal pages count one each; a compound (huge) page is charged
 * at its full size, but only once across all of this ring's registered
 * buffers (see headpage_already_acct()). On charge failure acct_pages
 * is reset so teardown doesn't unaccount what was never charged.
 */
static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			/* cheap check against the most recent huge page seen */
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx->user, ctx->mm_account, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}
677 
/*
 * Collapse a page array whose pages belong to whole folios into one
 * entry per folio (as described by @data, produced by
 * io_check_coalesce_buffer()). All but one pin per folio is dropped;
 * the remaining single pin keeps the entire folio resident. On success
 * *pages/*nr_pages are replaced with the per-folio head array. Returns
 * false (leaving the input untouched) if the new array can't be
 * allocated.
 */
static bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
				struct io_imu_folio_data *data)
{
	struct page **page_array = *pages, **new_array = NULL;
	unsigned nr_pages_left = *nr_pages;
	unsigned nr_folios = data->nr_folios;
	unsigned i, j;

	/* Store head pages only*/
	new_array = kvmalloc_array(nr_folios, sizeof(struct page *), GFP_KERNEL);
	if (!new_array)
		return false;

	for (i = 0, j = 0; i < nr_folios; i++) {
		struct page *p = compound_head(page_array[j]);
		struct folio *folio = page_folio(p);
		unsigned int nr;

		/* only the first folio may start mid-folio */
		WARN_ON_ONCE(i > 0 && p != page_array[j]);

		nr = i ? data->nr_pages_mid : data->nr_pages_head;
		nr = min(nr, nr_pages_left);
		/* Drop all but one ref, the entire folio will remain pinned. */
		if (nr > 1)
			unpin_user_folio(folio, nr - 1);
		j += nr;
		nr_pages_left -= nr;
		new_array[i] = p;
	}

	/* every input page must have been consumed by exactly one folio */
	WARN_ON_ONCE(j != *nr_pages);

	kvfree(page_array);
	*pages = new_array;
	*nr_pages = nr_folios;
	return true;
}
715 
/*
 * Decide whether @page_array can be coalesced into per-folio bvec
 * entries, filling @data with the folio geometry (shift, head/mid page
 * counts, folio count, and the offset of the first page within its
 * folio). Returns true only if the pages are contiguous within each
 * folio, every non-first folio starts at page 0, and all folios share
 * the same size except that the head and tail runs may be shorter.
 */
bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
			      struct io_imu_folio_data *data)
{
	struct folio *folio = page_folio(page_array[0]);
	unsigned int count = 1, nr_folios = 1;
	int i;

	data->nr_pages_mid = folio_nr_pages(folio);
	data->folio_shift = folio_shift(folio);
	data->first_folio_page_idx = folio_page_idx(folio, page_array[0]);

	/*
	 * Check if pages are contiguous inside a folio, and all folios have
	 * the same page count except for the head and tail.
	 */
	for (i = 1; i < nr_pages; i++) {
		if (page_folio(page_array[i]) == folio &&
			page_array[i] == page_array[i-1] + 1) {
			count++;
			continue;
		}

		/* crossed into a new folio: validate the run we just left */
		if (nr_folios == 1) {
			/* the head run must extend to the end of its folio */
			if (folio_page_idx(folio, page_array[i-1]) !=
				data->nr_pages_mid - 1)
				return false;

			data->nr_pages_head = count;
		} else if (count != data->nr_pages_mid) {
			return false;
		}

		/* new folio must match size and start at its first page */
		folio = page_folio(page_array[i]);
		if (folio_size(folio) != (1UL << data->folio_shift) ||
			folio_page_idx(folio, page_array[i]) != 0)
			return false;

		count = 1;
		nr_folios++;
	}
	if (nr_folios == 1)
		data->nr_pages_head = count;

	data->nr_folios = nr_folios;
	return true;
}
762 
/*
 * Pin a single user buffer described by @iov and wrap it in a new
 * IORING_RSRC_BUFFER node. A NULL iov_base with zero length yields a
 * NULL return, meaning "clear the slot". Huge-page backed ranges are
 * coalesced into one bvec per folio where possible. @last_hpage caches
 * the most recently accounted compound head across calls to avoid
 * double accounting. Returns the node, NULL, or an ERR_PTR on failure
 * (with all pins and allocations undone).
 */
static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
						   struct iovec *iov,
						   struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	struct io_rsrc_node *node;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;
	struct io_imu_folio_data data;
	bool coalesced = false;

	if (!iov->iov_base) {
		if (iov->iov_len)
			return ERR_PTR(-EFAULT);
		/* remove the buffer without installing a new one */
		return NULL;
	}

	ret = io_validate_user_buf_range((unsigned long)iov->iov_base,
					 iov->iov_len);
	if (ret)
		return ERR_PTR(ret);

	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
	if (!node)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
				&nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	/* If it's huge page(s), try to coalesce them into fewer bvec entries */
	if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) {
		if (data.nr_pages_mid != 1)
			coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
	}

	imu = io_alloc_imu(ctx, nr_pages);
	if (!imu)
		goto done;

	imu->nr_bvecs = nr_pages;
	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret)
		goto done;

	size = iov->iov_len;
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->len = iov->iov_len;
	imu->folio_shift = PAGE_SHIFT;
	imu->release = io_release_ubuf;
	imu->priv = imu;
	imu->flags = 0;
	imu->dir = IO_IMU_DEST | IO_IMU_SOURCE;
	if (coalesced)
		imu->folio_shift = data.folio_shift;
	refcount_set(&imu->refs, 1);

	/* offset of the buffer start within the first bvec entry */
	off = (unsigned long)iov->iov_base & ~PAGE_MASK;
	if (coalesced)
		off += data.first_folio_page_idx << PAGE_SHIFT;

	node->buf = imu;
	ret = 0;

	/* fill one bvec per (possibly folio-sized) page */
	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
		off = 0;
		size -= vec_len;
	}
done:
	if (ret) {
		/* unwind: free the imu, drop every pin, recycle the node */
		if (imu)
			io_free_imu(ctx, imu);
		if (pages) {
			for (i = 0; i < nr_pages; i++)
				unpin_user_folio(page_folio(pages[i]), 1);
		}
		io_cache_free(&ctx->node_cache, node);
		node = ERR_PTR(ret);
	}
	kvfree(pages);
	return node;
}
858 
/*
 * IORING_REGISTER_BUFFERS(2): pin and register @nr_args user buffers
 * (iovec array at @arg; NULL @arg registers an all-sparse table), with
 * optional per-slot @tags. The table is installed even on partial
 * failure so the common unregister path can tear it down, with tags
 * cleared first to suppress spurious tag CQEs. Returns 0 or the first
 * error encountered.
 */
int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data data;
	struct iovec fast_iov, *iov = &fast_iov;
	const struct iovec __user *uvec;
	int i, ret;

	/* buf_index in the SQE is 16 bits wide */
	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->buf_table.nr)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_data_alloc(&data, nr_args);
	if (ret)
		return ret;

	/* sparse registration: every slot gets the NULL iovec below */
	if (!arg)
		memset(iov, 0, sizeof(*iov));

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		if (arg) {
			uvec = (struct iovec __user *) arg;
			iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
			if (IS_ERR(iov)) {
				ret = PTR_ERR(iov);
				break;
			}
			/* compat tasks pass an array of 32-bit iovecs */
			if (ctx->compat)
				arg += sizeof(struct compat_iovec);
			else
				arg += sizeof(struct iovec);
		}

		if (tags) {
			if (copy_from_user(&tag, &tags[i], sizeof(tag))) {
				ret = -EFAULT;
				break;
			}
		}

		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			ret = PTR_ERR(node);
			break;
		}
		/* a NULL node is a sparse slot, which must not carry a tag */
		if (tag) {
			if (!node) {
				ret = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		data.nodes[i] = node;
	}

	ctx->buf_table = data;
	if (ret) {
		io_clear_table_tags(&ctx->buf_table);
		io_sqe_buffers_unregister(ctx);
	}
	return ret;
}
927 
928 int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
929 			    void (*release)(void *), unsigned int index,
930 			    unsigned int issue_flags)
931 {
932 	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
933 	struct io_rsrc_data *data = &ctx->buf_table;
934 	struct req_iterator rq_iter;
935 	struct io_mapped_ubuf *imu;
936 	struct io_rsrc_node *node;
937 	struct bio_vec bv;
938 	unsigned int nr_bvecs = 0;
939 	int ret = 0;
940 
941 	io_ring_submit_lock(ctx, issue_flags);
942 	if (index >= data->nr) {
943 		ret = -EINVAL;
944 		goto unlock;
945 	}
946 	index = array_index_nospec(index, data->nr);
947 
948 	if (data->nodes[index]) {
949 		ret = -EBUSY;
950 		goto unlock;
951 	}
952 
953 	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
954 	if (!node) {
955 		ret = -ENOMEM;
956 		goto unlock;
957 	}
958 
959 	/*
960 	 * blk_rq_nr_phys_segments() may overestimate the number of bvecs
961 	 * but avoids needing to iterate over the bvecs
962 	 */
963 	imu = io_alloc_imu(ctx, blk_rq_nr_phys_segments(rq));
964 	if (!imu) {
965 		kfree(node);
966 		ret = -ENOMEM;
967 		goto unlock;
968 	}
969 
970 	imu->ubuf = 0;
971 	imu->len = blk_rq_bytes(rq);
972 	imu->acct_pages = 0;
973 	imu->folio_shift = PAGE_SHIFT;
974 	refcount_set(&imu->refs, 1);
975 	imu->release = release;
976 	imu->priv = rq;
977 	imu->flags = IO_REGBUF_F_KBUF;
978 	imu->dir = 1 << rq_data_dir(rq);
979 
980 	rq_for_each_bvec(bv, rq, rq_iter)
981 		imu->bvec[nr_bvecs++] = bv;
982 	imu->nr_bvecs = nr_bvecs;
983 
984 	node->buf = imu;
985 	data->nodes[index] = node;
986 unlock:
987 	io_ring_submit_unlock(ctx, issue_flags);
988 	return ret;
989 }
990 EXPORT_SYMBOL_GPL(io_buffer_register_bvec);
991 
/*
 * Remove a kernel (bvec) buffer previously installed at slot @index by
 * io_buffer_register_bvec(). Only IO_REGBUF_F_KBUF buffers may be
 * removed this way; user-registered buffers return -EBUSY. Returns 0
 * on success or -EINVAL for an empty/out-of-range slot.
 */
int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
			      unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *data = &ctx->buf_table;
	struct io_rsrc_node *node;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	if (index >= data->nr) {
		ret = -EINVAL;
		goto unlock;
	}
	index = array_index_nospec(index, data->nr);

	node = data->nodes[index];
	if (!node) {
		ret = -EINVAL;
		goto unlock;
	}
	/* refuse to yank user-registered buffers through this interface */
	if (!(node->buf->flags & IO_REGBUF_F_KBUF)) {
		ret = -EBUSY;
		goto unlock;
	}

	io_put_rsrc_node(ctx, node);
	data->nodes[index] = NULL;
unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec);
1024 
1025 static int validate_fixed_range(u64 buf_addr, size_t len,
1026 				const struct io_mapped_ubuf *imu)
1027 {
1028 	u64 buf_end;
1029 
1030 	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
1031 		return -EFAULT;
1032 	/* not inside the mapped region */
1033 	if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
1034 		return -EFAULT;
1035 	if (unlikely(len > MAX_RW_COUNT))
1036 		return -EFAULT;
1037 	return 0;
1038 }
1039 
1040 static int io_import_kbuf(int ddir, struct iov_iter *iter,
1041 			  struct io_mapped_ubuf *imu, size_t len, size_t offset)
1042 {
1043 	size_t count = len + offset;
1044 
1045 	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count);
1046 	iov_iter_advance(iter, offset);
1047 	return 0;
1048 }
1049 
/*
 * Set up @iter over the registered buffer @imu for a transfer of @len
 * bytes starting at user address @buf_addr. Validates the range and the
 * transfer direction against the buffer's allowed directions, then
 * builds a bvec iterator that starts directly at the right segment
 * instead of advancing from the front.
 */
static int io_import_fixed(int ddir, struct iov_iter *iter,
			   struct io_mapped_ubuf *imu,
			   u64 buf_addr, size_t len)
{
	const struct bio_vec *bvec;
	size_t folio_mask;
	unsigned nr_segs;
	size_t offset;
	int ret;

	ret = validate_fixed_range(buf_addr, len, imu);
	if (unlikely(ret))
		return ret;
	if (!(imu->dir & (1 << ddir)))
		return -EFAULT;

	/* byte offset of the transfer within the registered range */
	offset = buf_addr - imu->ubuf;

	/* kernel bvec buffers don't have uniform segment sizes; take the
	 * generic path */
	if (imu->flags & IO_REGBUF_F_KBUF)
		return io_import_kbuf(ddir, iter, imu, len, offset);

	/*
	 * Don't use iov_iter_advance() here, as it's really slow for
	 * using the latter parts of a big fixed buffer - it iterates
	 * over each segment manually. We can cheat a bit here for user
	 * registered nodes, because we know that:
	 *
	 * 1) it's a BVEC iter, we set it up
	 * 2) all bvecs are the same in size, except potentially the
	 *    first and last bvec
	 */
	folio_mask = (1UL << imu->folio_shift) - 1;
	bvec = imu->bvec;
	if (offset >= bvec->bv_len) {
		unsigned long seg_skip;

		/* skip first vec */
		offset -= bvec->bv_len;
		seg_skip = 1 + (offset >> imu->folio_shift);
		bvec += seg_skip;
		offset &= folio_mask;
	}
	/* number of folio-sized segments the transfer touches */
	nr_segs = (offset + len + bvec->bv_offset + folio_mask) >> imu->folio_shift;
	iov_iter_bvec(iter, ddir, bvec, nr_segs, len);
	iter->iov_offset = offset;
	return 0;
}
1097 
/*
 * Resolve req->buf_index to its registered buffer node, taking a node
 * reference on first use and caching the result in req->buf_node
 * (marked by REQ_F_BUF_NODE) so repeated imports skip the table lookup.
 * Returns NULL if no buffer is registered at that index.
 */
inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req,
					     unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_rsrc_node *node;

	if (req->flags & REQ_F_BUF_NODE)
		return req->buf_node;
	/* set optimistically; cleared below if the lookup fails */
	req->flags |= REQ_F_BUF_NODE;

	io_ring_submit_lock(ctx, issue_flags);
	node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
	if (node) {
		/* node refs are protected by the ring lock held here */
		node->refs++;
		req->buf_node = node;
		io_ring_submit_unlock(ctx, issue_flags);
		return node;
	}
	req->flags &= ~REQ_F_BUF_NODE;
	io_ring_submit_unlock(ctx, issue_flags);
	return NULL;
}
1120 
1121 int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,
1122 			u64 buf_addr, size_t len, int ddir,
1123 			unsigned issue_flags)
1124 {
1125 	struct io_rsrc_node *node;
1126 
1127 	node = io_find_buf_node(req, issue_flags);
1128 	if (!node)
1129 		return -EFAULT;
1130 	return io_import_fixed(ddir, iter, node->buf, buf_addr, len);
1131 }
1132 
/* Lock two rings at once. The rings must be different! */
static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
{
	/* order by address so concurrent callers can't ABBA-deadlock */
	if (ctx1 > ctx2)
		swap(ctx1, ctx2);
	mutex_lock(&ctx1->uring_lock);
	/* both locks share a lockdep class; annotate the nesting */
	mutex_lock_nested(&ctx2->uring_lock, SINGLE_DEPTH_NESTING);
}
1141 
/* Both rings are locked by the caller. */
static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
			    struct io_uring_clone_buffers *arg)
{
	struct io_rsrc_data data;
	int i, ret, off, nr;
	unsigned int nbufs;

	lockdep_assert_held(&ctx->uring_lock);
	lockdep_assert_held(&src_ctx->uring_lock);

	/*
	 * Accounting state is shared between the two rings; that only works if
	 * both rings are accounted towards the same counters.
	 */
	if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account)
		return -EINVAL;

	/* if offsets are given, must have nr specified too */
	if (!arg->nr && (arg->dst_off || arg->src_off))
		return -EINVAL;
	/* not allowed unless REPLACE is set */
	if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
		return -EBUSY;

	nbufs = src_ctx->buf_table.nr;
	if (!nbufs)
		return -ENXIO;
	/* nr == 0 means "clone the whole source table" */
	if (!arg->nr)
		arg->nr = nbufs;
	else if (arg->nr > nbufs)
		return -EINVAL;
	else if (arg->nr > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	/* source range [src_off, src_off + nr) must fit in the source table */
	if (check_add_overflow(arg->nr, arg->src_off, &off) || off > nbufs)
		return -EOVERFLOW;
	/* from here on, nbufs is the end of the destination range */
	if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
		return -EOVERFLOW;
	if (nbufs > IORING_MAX_REG_BUFFERS)
		return -EINVAL;

	/* new table must hold both the clones and any surviving dst nodes */
	ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr));
	if (ret)
		return ret;

	/* Copy original dst nodes from before the cloned range */
	for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
		struct io_rsrc_node *node = ctx->buf_table.nodes[i];

		if (node) {
			data.nodes[i] = node;
			node->refs++;
		}
	}

	off = arg->dst_off;
	i = arg->src_off;
	nr = arg->nr;
	while (nr--) {
		struct io_rsrc_node *dst_node, *src_node;

		src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i);
		if (!src_node) {
			/* sparse source slots are cloned as empty slots */
			dst_node = NULL;
		} else {
			dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
			if (!dst_node) {
				io_rsrc_data_free(ctx, &data);
				return -ENOMEM;
			}

			/* share the existing mapping instead of re-pinning pages */
			refcount_inc(&src_node->buf->refs);
			dst_node->buf = src_node->buf;
		}
		data.nodes[off++] = dst_node;
		i++;
	}

	/* Copy original dst nodes from after the cloned range */
	for (i = nbufs; i < ctx->buf_table.nr; i++) {
		struct io_rsrc_node *node = ctx->buf_table.nodes[i];

		if (node) {
			data.nodes[i] = node;
			node->refs++;
		}
	}

	/*
	 * If asked for replace, put the old table. data->nodes[] holds both
	 * old and new nodes at this point.
	 */
	if (arg->flags & IORING_REGISTER_DST_REPLACE)
		io_rsrc_data_free(ctx, &ctx->buf_table);

	/*
	 * ctx->buf_table must be empty now - either the contents are being
	 * replaced and we just freed the table, or the contents are being
	 * copied to a ring that does not have buffers yet (checked at function
	 * entry).
	 */
	WARN_ON_ONCE(ctx->buf_table.nr);
	ctx->buf_table = data;
	return 0;
}
1247 
1248 /*
1249  * Copy the registered buffers from the source ring whose file descriptor
1250  * is given in the src_fd to the current ring. This is identical to registering
1251  * the buffers with ctx, except faster as mappings already exist.
1252  *
1253  * Since the memory is already accounted once, don't account it again.
1254  */
int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_clone_buffers buf;
	struct io_ring_ctx *src_ctx;
	bool registered_src;
	struct file *file;
	int ret;

	if (copy_from_user(&buf, arg, sizeof(buf)))
		return -EFAULT;
	if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE))
		return -EINVAL;
	/* without REPLACE, the destination table must be empty */
	if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr)
		return -EBUSY;
	if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
		return -EINVAL;

	/* src_fd may be a regular fd or a registered ring fd */
	registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
	file = io_uring_register_get_file(buf.src_fd, registered_src);
	if (IS_ERR(file))
		return PTR_ERR(file);

	src_ctx = file->private_data;
	if (src_ctx != ctx) {
		/*
		 * Caller holds ctx->uring_lock; drop it and retake both
		 * ring locks in a fixed (address) order to avoid deadlock.
		 */
		mutex_unlock(&ctx->uring_lock);
		lock_two_rings(ctx, src_ctx);

		/* only the source ring's submitter task may clone from it */
		if (src_ctx->submitter_task &&
		    src_ctx->submitter_task != current) {
			ret = -EEXIST;
			goto out;
		}
	}

	ret = io_clone_buffers(ctx, src_ctx, &buf);

out:
	/* ctx->uring_lock stays held for the caller */
	if (src_ctx != ctx)
		mutex_unlock(&src_ctx->uring_lock);

	fput(file);
	return ret;
}
1298 
1299 void io_vec_free(struct iou_vec *iv)
1300 {
1301 	if (!iv->iovec)
1302 		return;
1303 	kfree(iv->iovec);
1304 	iv->iovec = NULL;
1305 	iv->nr = 0;
1306 }
1307 
1308 int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries)
1309 {
1310 	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_NOWARN;
1311 	struct iovec *iov;
1312 
1313 	iov = kmalloc_array(nr_entries, sizeof(iov[0]), gfp);
1314 	if (!iov)
1315 		return -ENOMEM;
1316 
1317 	io_vec_free(iv);
1318 	iv->iovec = iov;
1319 	iv->nr = nr_entries;
1320 	return 0;
1321 }
1322 
/*
 * Translate user-address iovecs over a registered user buffer into the
 * bio_vec array of @vec and initialize @iter over the result. Each iovec
 * is split into folio-sized segments; the caller sized vec->bvec via
 * io_estimate_bvec_size(). Returns 0 or a negative error.
 */
static int io_vec_fill_bvec(int ddir, struct iov_iter *iter,
				struct io_mapped_ubuf *imu,
				struct iovec *iovec, unsigned nr_iovs,
				struct iou_vec *vec)
{
	unsigned long folio_size = 1 << imu->folio_shift;
	unsigned long folio_mask = folio_size - 1;
	struct bio_vec *res_bvec = vec->bvec;
	size_t total_len = 0;
	unsigned bvec_idx = 0;
	unsigned iov_idx;

	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
		size_t iov_len = iovec[iov_idx].iov_len;
		u64 buf_addr = (u64)(uintptr_t)iovec[iov_idx].iov_base;
		struct bio_vec *src_bvec;
		size_t offset;
		int ret;

		/* each iovec must lie entirely within the registered buffer */
		ret = validate_fixed_range(buf_addr, iov_len, imu);
		if (unlikely(ret))
			return ret;

		if (unlikely(!iov_len))
			return -EFAULT;
		if (unlikely(check_add_overflow(total_len, iov_len, &total_len)))
			return -EOVERFLOW;

		/* byte offset of this iovec from the start of the mapping */
		offset = buf_addr - imu->ubuf;
		/*
		 * Only the first bvec can have non zero bv_offset, account it
		 * here and work with full folios below.
		 */
		offset += imu->bvec[0].bv_offset;

		src_bvec = imu->bvec + (offset >> imu->folio_shift);
		offset &= folio_mask;

		/* emit one segment per (partial) folio covered by the iovec */
		for (; iov_len; offset = 0, bvec_idx++, src_bvec++) {
			size_t seg_size = min_t(size_t, iov_len,
						folio_size - offset);

			bvec_set_page(&res_bvec[bvec_idx],
				      src_bvec->bv_page, seg_size, offset);
			iov_len -= seg_size;
		}
	}
	if (total_len > MAX_RW_COUNT)
		return -EINVAL;

	iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len);
	return 0;
}
1376 
1377 static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs,
1378 				 struct io_mapped_ubuf *imu)
1379 {
1380 	unsigned shift = imu->folio_shift;
1381 	size_t max_segs = 0;
1382 	unsigned i;
1383 
1384 	for (i = 0; i < nr_iovs; i++) {
1385 		max_segs += (iov[i].iov_len >> shift) + 2;
1386 		if (max_segs > INT_MAX)
1387 			return -EOVERFLOW;
1388 	}
1389 	return max_segs;
1390 }
1391 
/*
 * Translate iovecs over a kernel-registered buffer (IO_REGBUF_F_KBUF)
 * into @vec's bio_vec array and initialize @iter. Here iov_base is a
 * byte offset into the source bvec table rather than a user address;
 * ranges and lengths were already validated by io_kern_bvec_size().
 */
static int io_vec_fill_kern_bvec(int ddir, struct iov_iter *iter,
				 struct io_mapped_ubuf *imu,
				 struct iovec *iovec, unsigned nr_iovs,
				 struct iou_vec *vec)
{
	const struct bio_vec *src_bvec = imu->bvec;
	struct bio_vec *res_bvec = vec->bvec;
	unsigned res_idx = 0;
	size_t total_len = 0;
	unsigned iov_idx;

	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
		size_t offset = (size_t)(uintptr_t)iovec[iov_idx].iov_base;
		size_t iov_len = iovec[iov_idx].iov_len;
		/* bound the iterator to the end of this iovec's range */
		struct bvec_iter bi = {
			.bi_size        = offset + iov_len,
		};
		struct bio_vec bv;

		/* skip to the requested offset, then copy out the segments */
		bvec_iter_advance(src_bvec, &bi, offset);
		for_each_mp_bvec(bv, src_bvec, bi, bi)
			res_bvec[res_idx++] = bv;
		total_len += iov_len;
	}
	iov_iter_bvec(iter, ddir, res_bvec, res_idx, total_len);
	return 0;
}
1419 
/*
 * Count how many bvec segments of @imu the single iovec @iov spans,
 * where iov_base is interpreted as a byte offset into the kernel buffer.
 * Stores the count in @nr_seg; returns 0, or a negative error if the
 * range falls outside the registered buffer.
 */
static int iov_kern_bvec_size(const struct iovec *iov,
			      const struct io_mapped_ubuf *imu,
			      unsigned int *nr_seg)
{
	size_t offset = (size_t)(uintptr_t)iov->iov_base;
	const struct bio_vec *bvec = imu->bvec;
	int start = 0, i = 0;
	size_t off = 0;
	int ret;

	ret = validate_fixed_range(offset, iov->iov_len, imu);
	if (unlikely(ret))
		return ret;

	/*
	 * Walk the bvec table up to the end of the requested range,
	 * remembering the index of the segment containing the start.
	 */
	for (i = 0; off < offset + iov->iov_len && i < imu->nr_bvecs;
			off += bvec[i].bv_len, i++) {
		if (offset >= off && offset < off + bvec[i].bv_len)
			start = i;
	}
	*nr_seg = i - start;
	return 0;
}
1442 
/*
 * Validate @nr_iovs offset/length pairs against the kernel buffer @imu
 * and total up the bvec segments needed to cover them, returned via
 * @nr_segs. Rejects zero-length entries, length overflow, and totals
 * beyond MAX_RW_COUNT.
 */
static int io_kern_bvec_size(struct iovec *iov, unsigned nr_iovs,
			     struct io_mapped_ubuf *imu, unsigned *nr_segs)
{
	unsigned max_segs = 0;
	size_t total_len = 0;
	unsigned i;
	int ret;

	*nr_segs = 0;
	for (i = 0; i < nr_iovs; i++) {
		if (unlikely(!iov[i].iov_len))
			return -EFAULT;
		if (unlikely(check_add_overflow(total_len, iov[i].iov_len,
						&total_len)))
			return -EOVERFLOW;
		/* max_segs receives the per-iovec segment count */
		ret = iov_kern_bvec_size(&iov[i], imu, &max_segs);
		if (unlikely(ret))
			return ret;
		*nr_segs += max_segs;
	}
	if (total_len > MAX_RW_COUNT)
		return -EINVAL;
	return 0;
}
1467 
/*
 * Import a vectored registered-buffer request: resolve the buffer node,
 * size (and if needed grow) the request's cached vector, and build @iter
 * over the mapped pages. The iovecs to import sit right-aligned at the
 * tail of @vec (see io_prep_reg_iovec()). Returns 0 or a negative error.
 */
int io_import_reg_vec(int ddir, struct iov_iter *iter,
			struct io_kiocb *req, struct iou_vec *vec,
			unsigned nr_iovs, unsigned issue_flags)
{
	struct io_rsrc_node *node;
	struct io_mapped_ubuf *imu;
	unsigned iovec_off;
	struct iovec *iov;
	unsigned nr_segs;

	node = io_find_buf_node(req, issue_flags);
	if (!node)
		return -EFAULT;
	imu = node->buf;
	/* the buffer must allow this transfer direction */
	if (!(imu->dir & (1 << ddir)))
		return -EFAULT;

	/* the user iovecs were stored right-aligned in the cached vector */
	iovec_off = vec->nr - nr_iovs;
	iov = vec->iovec + iovec_off;

	if (imu->flags & IO_REGBUF_F_KBUF) {
		int ret = io_kern_bvec_size(iov, nr_iovs, imu, &nr_segs);

		if (unlikely(ret))
			return ret;
	} else {
		int ret = io_estimate_bvec_size(iov, nr_iovs, imu);

		if (ret < 0)
			return ret;
		nr_segs = ret;
	}

	/*
	 * vec's single allocation is shared between iovecs and bio_vecs. If
	 * bio_vec is the larger type, convert the segment count into the
	 * number of iovec-sized slots needed to hold the bvecs, plus room
	 * for the iovec tail itself.
	 */
	if (sizeof(struct bio_vec) > sizeof(struct iovec)) {
		size_t bvec_bytes;

		bvec_bytes = nr_segs * sizeof(struct bio_vec);
		nr_segs = (bvec_bytes + sizeof(*iov) - 1) / sizeof(*iov);
		nr_segs += nr_iovs;
	}

	if (nr_segs > vec->nr) {
		struct iou_vec tmp_vec = {};
		int ret;

		ret = io_vec_realloc(&tmp_vec, nr_segs);
		if (ret)
			return ret;

		/* preserve the user iovecs at the tail of the new array */
		iovec_off = tmp_vec.nr - nr_iovs;
		memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs);
		io_vec_free(vec);

		*vec = tmp_vec;
		iov = vec->iovec + iovec_off;
		req->flags |= REQ_F_NEED_CLEANUP;
	}

	if (imu->flags & IO_REGBUF_F_KBUF)
		return io_vec_fill_kern_bvec(ddir, iter, imu, iov, nr_iovs, vec);

	return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec);
}
1531 
1532 int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv,
1533 		      const struct iovec __user *uvec, size_t uvec_segs)
1534 {
1535 	struct iovec *iov;
1536 	int iovec_off, ret;
1537 	void *res;
1538 
1539 	if (uvec_segs > iv->nr) {
1540 		ret = io_vec_realloc(iv, uvec_segs);
1541 		if (ret)
1542 			return ret;
1543 		req->flags |= REQ_F_NEED_CLEANUP;
1544 	}
1545 
1546 	/* pad iovec to the right */
1547 	iovec_off = iv->nr - uvec_segs;
1548 	iov = iv->iovec + iovec_off;
1549 	res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov,
1550 			      io_is_compat(req->ctx));
1551 	if (IS_ERR(res))
1552 		return PTR_ERR(res);
1553 
1554 	req->flags |= REQ_F_IMPORT_BUFFER;
1555 	return 0;
1556 }
1557