xref: /linux/io_uring/rsrc.c (revision ba9c792c824fff732df85119011d399d9b6d9155)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/kernel.h>
3 #include <linux/errno.h>
4 #include <linux/fs.h>
5 #include <linux/file.h>
6 #include <linux/mm.h>
7 #include <linux/slab.h>
8 #include <linux/nospec.h>
9 #include <linux/hugetlb.h>
10 #include <linux/compat.h>
11 #include <linux/io_uring.h>
12 #include <linux/io_uring/cmd.h>
13 
14 #include <uapi/linux/io_uring.h>
15 
16 #include "filetable.h"
17 #include "io_uring.h"
18 #include "openclose.h"
19 #include "rsrc.h"
20 #include "memmap.h"
21 #include "register.h"
22 
/*
 * Per-request command data for a files-update request. The fields other than
 * @file are filled in by io_files_update_prep() from the SQE and read back by
 * io_files_update() and io_files_update_with_index_alloc().
 */
struct io_rsrc_update {
	struct file			*file;
	u64				arg;		/* sqe->addr: user address of the fd array */
	u32				nr_args;	/* sqe->len: number of entries, rejected if 0 */
	u32				offset;		/* sqe->off: first slot, or IORING_FILE_INDEX_ALLOC */
};
29 
30 static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
31 						   struct iovec *iov);
32 
33 static int hpage_acct_ref(struct io_ring_ctx *ctx, struct page *hpage,
34 			  bool *acct_new)
35 {
36 	unsigned long key = (unsigned long) hpage;
37 	unsigned long count;
38 	void *entry;
39 	int ret;
40 
41 	lockdep_assert_held(&ctx->uring_lock);
42 
43 	entry = xa_load(&ctx->hpage_acct, key);
44 	if (entry) {
45 		*acct_new = false;
46 		count = xa_to_value(entry) + 1;
47 	} else {
48 		ret = xa_reserve(&ctx->hpage_acct, key, GFP_KERNEL_ACCOUNT);
49 		if (ret)
50 			return ret;
51 		*acct_new = true;
52 		count = 1;
53 	}
54 	xa_store(&ctx->hpage_acct, key, xa_mk_value(count), GFP_KERNEL_ACCOUNT);
55 	return 0;
56 }
57 
58 static bool hpage_acct_unref(struct io_ring_ctx *ctx, struct page *hpage)
59 {
60 	unsigned long key = (unsigned long) hpage;
61 	unsigned long count;
62 	void *entry;
63 
64 	lockdep_assert_held(&ctx->uring_lock);
65 
66 	entry = xa_load(&ctx->hpage_acct, key);
67 	if (WARN_ON_ONCE(!entry))
68 		return false;
69 	count = xa_to_value(entry);
70 	if (count == 1) {
71 		xa_erase(&ctx->hpage_acct, key);
72 		return true;
73 	}
74 	xa_store(&ctx->hpage_acct, key, xa_mk_value(count - 1), GFP_KERNEL_ACCOUNT);
75 	return false;
76 }
77 
78 /* only define max */
79 #define IORING_MAX_FIXED_FILES	(1U << 20)
80 #define IORING_MAX_REG_BUFFERS	(1U << 14)
81 
82 #define IO_CACHED_BVECS_SEGS	32
83 
84 int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
85 {
86 	unsigned long page_limit, cur_pages, new_pages;
87 
88 	if (!nr_pages)
89 		return 0;
90 
91 	/* Don't allow more pages than we can safely lock */
92 	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
93 
94 	cur_pages = atomic_long_read(&user->locked_vm);
95 	do {
96 		new_pages = cur_pages + nr_pages;
97 		if (new_pages > page_limit)
98 			return -ENOMEM;
99 	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
100 					  &cur_pages, new_pages));
101 	return 0;
102 }
103 
104 void io_unaccount_mem(struct user_struct *user, struct mm_struct *mm_account,
105 		      unsigned long nr_pages)
106 {
107 	if (user)
108 		__io_unaccount_mem(user, nr_pages);
109 
110 	if (mm_account)
111 		atomic64_sub(nr_pages, &mm_account->pinned_vm);
112 }
113 
114 int io_account_mem(struct user_struct *user, struct mm_struct *mm_account,
115 		   unsigned long nr_pages)
116 {
117 	int ret;
118 
119 	if (user) {
120 		ret = __io_account_mem(user, nr_pages);
121 		if (ret)
122 			return ret;
123 	}
124 
125 	if (mm_account)
126 		atomic64_add(nr_pages, &mm_account->pinned_vm);
127 
128 	return 0;
129 }
130 
131 int io_validate_user_buf_range(u64 uaddr, u64 ulen)
132 {
133 	unsigned long tmp, base = (unsigned long)uaddr;
134 	unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen);
135 
136 	if (!ulen)
137 		return -EFAULT;
138 	/* 32-bit sanity checking */
139 	if (ulen > ULONG_MAX || uaddr > ULONG_MAX)
140 		return -EFAULT;
141 	/* cap to 1TB for 64-bit */
142 	if (ulen > SZ_1T)
143 		return -EINVAL;
144 	if (check_add_overflow(base, acct_len, &tmp))
145 		return -EOVERFLOW;
146 	return 0;
147 }
148 
149 static void io_release_ubuf(void *priv)
150 {
151 	struct io_mapped_ubuf *imu = priv;
152 	unsigned int i;
153 
154 	for (i = 0; i < imu->nr_bvecs; i++) {
155 		struct folio *folio = bvec_folio(&imu->bvec[i]);
156 
157 		unpin_user_folio(folio, 1);
158 	}
159 }
160 
161 static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
162 					   int nr_bvecs)
163 {
164 	if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
165 		return io_cache_alloc(&ctx->imu_cache, GFP_KERNEL);
166 	return kvmalloc_flex(struct io_mapped_ubuf, bvec, nr_bvecs);
167 }
168 
169 static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
170 {
171 	if (imu->nr_bvecs <= IO_CACHED_BVECS_SEGS)
172 		io_cache_free(&ctx->imu_cache, imu);
173 	else
174 		kvfree(imu);
175 }
176 
177 static unsigned long io_buffer_unaccount_pages(struct io_ring_ctx *ctx,
178 					       struct io_mapped_ubuf *imu)
179 {
180 	struct page *seen = NULL;
181 	unsigned long acct = 0;
182 	int i;
183 
184 	if (imu->flags & IO_REGBUF_F_KBUF || !ctx->user)
185 		return 0;
186 
187 	for (i = 0; i < imu->nr_bvecs; i++) {
188 		struct page *page = imu->bvec[i].bv_page;
189 		struct page *hpage;
190 
191 		if (!PageCompound(page)) {
192 			acct++;
193 			continue;
194 		}
195 
196 		hpage = compound_head(page);
197 		if (hpage == seen)
198 			continue;
199 		seen = hpage;
200 
201 		/* Unaccount on last reference */
202 		if (hpage_acct_unref(ctx, hpage))
203 			acct += page_size(hpage) >> PAGE_SHIFT;
204 		cond_resched();
205 	}
206 
207 	return acct;
208 }
209 
210 static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
211 {
212 	unsigned long acct_pages = 0;
213 
214 	/* Always decrement, so it works for cloned buffers too */
215 	acct_pages = io_buffer_unaccount_pages(ctx, imu);
216 
217 	if (unlikely(refcount_read(&imu->refs) > 1)) {
218 		if (!refcount_dec_and_test(&imu->refs))
219 			return;
220 	}
221 
222 	if (acct_pages)
223 		io_unaccount_mem(ctx->user, ctx->mm_account, acct_pages);
224 	imu->release(imu->priv);
225 	io_free_imu(ctx, imu);
226 }
227 
228 struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
229 {
230 	struct io_rsrc_node *node;
231 
232 	node = io_cache_alloc(&ctx->node_cache, GFP_KERNEL);
233 	if (node) {
234 		node->type = type;
235 		node->refs = 1;
236 		node->tag = 0;
237 		node->file_ptr = 0;
238 	}
239 	return node;
240 }
241 
242 bool io_rsrc_cache_init(struct io_ring_ctx *ctx)
243 {
244 	const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec,
245 						 IO_CACHED_BVECS_SEGS);
246 	const int node_size = sizeof(struct io_rsrc_node);
247 	bool ret;
248 
249 	ret = io_alloc_cache_init(&ctx->node_cache, IO_ALLOC_CACHE_MAX,
250 				  node_size, 0);
251 	ret |= io_alloc_cache_init(&ctx->imu_cache, IO_ALLOC_CACHE_MAX,
252 				   imu_cache_size, 0);
253 	return ret;
254 }
255 
256 void io_rsrc_cache_free(struct io_ring_ctx *ctx)
257 {
258 	io_alloc_cache_free(&ctx->node_cache, kfree);
259 	io_alloc_cache_free(&ctx->imu_cache, kvfree);
260 }
261 
262 static void io_clear_table_tags(struct io_rsrc_data *data)
263 {
264 	int i;
265 
266 	for (i = 0; i < data->nr; i++) {
267 		struct io_rsrc_node *node = data->nodes[i];
268 
269 		if (node)
270 			node->tag = 0;
271 	}
272 }
273 
274 __cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
275 			      struct io_rsrc_data *data)
276 {
277 	if (!data->nr)
278 		return;
279 	while (data->nr--) {
280 		if (data->nodes[data->nr])
281 			io_put_rsrc_node(ctx, data->nodes[data->nr]);
282 	}
283 	kvfree(data->nodes);
284 	data->nodes = NULL;
285 	data->nr = 0;
286 }
287 
288 __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
289 {
290 	data->nodes = kvmalloc_objs(struct io_rsrc_node *, nr,
291 				    GFP_KERNEL_ACCOUNT | __GFP_ZERO);
292 	if (data->nodes) {
293 		data->nr = nr;
294 		return 0;
295 	}
296 	return -ENOMEM;
297 }
298 
/*
 * Update @nr_args slots of the registered file table starting at up->offset.
 *
 * up->data points to a user array of fds and up->tags (optional) to a user
 * array of tags. For each entry:
 *   - IORING_REGISTER_FILES_SKIP leaves the slot alone,
 *   - -1 only clears the slot,
 *   - any other fd clears the slot and installs the new file.
 * A non-zero tag combined with SKIP or -1 is rejected with -EINVAL.
 *
 * Returns -ENXIO if no file table is registered and -EINVAL if the range
 * does not fit in the table. Otherwise processing stops at the first failing
 * entry; the return value is the number of entries handled before that, or
 * the error itself if the very first entry failed.
 */
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	int fd, i, err = 0;
	unsigned int done;

	if (!ctx->file_table.data.nr)
		return -ENXIO;
	if (up->offset + nr_args > ctx->file_table.data.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		/* a tag needs a real file to be attached to */
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = up->offset + done;
		if (i >= ctx->file_table.data.nr)
			break;
		i = array_index_nospec(i, ctx->file_table.data.nr);
		/*
		 * The slot is reset before the new fd is looked up, so a
		 * failure below leaves it without a node.
		 */
		if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i))
			io_file_bitmap_clear(&ctx->file_table, i);

		if (fd != -1) {
			struct file *file = fget(fd);
			struct io_rsrc_node *node;

			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
			if (!node) {
				err = -ENOMEM;
				fput(file);
				break;
			}
			ctx->file_table.data.nodes[i] = node;
			if (tag)
				node->tag = tag;
			io_fixed_file_set(node, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}
	/* partial progress wins over the error code */
	return done ? done : err;
}
366 
/*
 * Update @nr_args slots of the registered buffer table starting at
 * up->offset.
 *
 * up->data is the user address of an iovec array (compat layout when the
 * ring is compat) and up->tags optionally points to a user array of tags.
 * Each iovec is registered through io_sqe_buffer_register() and the result
 * replaces whatever was in the slot; a NULL result (empty iovec) just clears
 * the slot. A non-zero tag on an empty iovec is rejected with -EINVAL.
 *
 * Returns -ENXIO if no buffer table is registered and -EINVAL if the range
 * does not fit in the table. Otherwise processing stops at the first failing
 * entry; the return value is the number of entries handled before that, or
 * the error itself if the very first entry failed.
 */
static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec fast_iov, *iov;
	struct iovec __user *uvec;
	u64 user_data = up->data;
	__u32 done;
	int i, err;

	if (!ctx->buf_table.nr)
		return -ENXIO;
	if (up->offset + nr_args > ctx->buf_table.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		uvec = u64_to_user_ptr(user_data);
		iov = iovec_from_user(uvec, 1, 1, &fast_iov, io_is_compat(ctx));
		if (IS_ERR(iov)) {
			err = PTR_ERR(iov);
			break;
		}
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		/* the new buffer is set up before the old slot is touched */
		node = io_sqe_buffer_register(ctx, iov);
		if (IS_ERR(node)) {
			err = PTR_ERR(node);
			break;
		}
		if (tag) {
			/* a tag needs a real buffer to be attached to */
			if (!node) {
				err = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
		io_reset_rsrc_node(ctx, &ctx->buf_table, i);
		ctx->buf_table.nodes[i] = node;
		/* step to the next user iovec, honouring the compat layout */
		if (io_is_compat(ctx))
			user_data += sizeof(struct compat_iovec);
		else
			user_data += sizeof(struct iovec);
	}
	/* partial progress wins over the error code */
	return done ? done : err;
}
419 
420 static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
421 				     struct io_uring_rsrc_update2 *up,
422 				     unsigned nr_args)
423 {
424 	__u32 tmp;
425 
426 	lockdep_assert_held(&ctx->uring_lock);
427 
428 	if (check_add_overflow(up->offset, nr_args, &tmp))
429 		return -EOVERFLOW;
430 
431 	switch (type) {
432 	case IORING_RSRC_FILE:
433 		return __io_sqe_files_update(ctx, up, nr_args);
434 	case IORING_RSRC_BUFFER:
435 		return __io_sqe_buffers_update(ctx, up, nr_args);
436 	}
437 	return -EINVAL;
438 }
439 
440 int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
441 			     unsigned nr_args)
442 {
443 	struct io_uring_rsrc_update2 up;
444 
445 	if (!nr_args)
446 		return -EINVAL;
447 	memset(&up, 0, sizeof(up));
448 	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
449 		return -EFAULT;
450 	if (up.resv || up.resv2)
451 		return -EINVAL;
452 	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
453 }
454 
455 int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
456 			    unsigned size, unsigned type)
457 {
458 	struct io_uring_rsrc_update2 up;
459 
460 	if (size != sizeof(up))
461 		return -EINVAL;
462 	if (copy_from_user(&up, arg, sizeof(up)))
463 		return -EFAULT;
464 	if (!up.nr || up.resv || up.resv2)
465 		return -EINVAL;
466 	return __io_register_rsrc_update(ctx, type, &up, up.nr);
467 }
468 
469 __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
470 			    unsigned int size, unsigned int type)
471 {
472 	struct io_uring_rsrc_register rr;
473 
474 	/* keep it extendible */
475 	if (size != sizeof(rr))
476 		return -EINVAL;
477 
478 	memset(&rr, 0, sizeof(rr));
479 	if (copy_from_user(&rr, arg, size))
480 		return -EFAULT;
481 	if (!rr.nr || rr.resv2)
482 		return -EINVAL;
483 	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
484 		return -EINVAL;
485 
486 	switch (type) {
487 	case IORING_RSRC_FILE:
488 		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
489 			break;
490 		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
491 					     rr.nr, u64_to_user_ptr(rr.tags));
492 	case IORING_RSRC_BUFFER:
493 		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
494 			break;
495 		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
496 					       rr.nr, u64_to_user_ptr(rr.tags));
497 	}
498 	return -EINVAL;
499 }
500 
501 int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
502 {
503 	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
504 
505 	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
506 		return -EINVAL;
507 	if (sqe->rw_flags || sqe->splice_fd_in)
508 		return -EINVAL;
509 
510 	up->offset = READ_ONCE(sqe->off);
511 	up->nr_args = READ_ONCE(sqe->len);
512 	if (!up->nr_args)
513 		return -EINVAL;
514 	up->arg = READ_ONCE(sqe->addr);
515 	return 0;
516 }
517 
/*
 * Files update variant used when the request offset is
 * IORING_FILE_INDEX_ALLOC: every fd in the user array is installed with
 * io_fixed_fd_install(..., IORING_FILE_INDEX_ALLOC) and the value that call
 * returns is written back over the fd in the same user array slot.
 *
 * Returns -ENXIO if no file table is registered. Otherwise processing stops
 * at the first failing entry; the return value is the number of entries
 * completed before that, or the error itself if the very first entry failed.
 */
static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_table.data.nr)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (get_user(fd, &fds[done])) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		/* report the result back; undo the install if that fails */
		if (put_user(ret, &fds[done])) {
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	/* partial progress wins over the error code */
	if (done)
		return done;
	return ret;
}
556 
557 int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
558 {
559 	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
560 	struct io_ring_ctx *ctx = req->ctx;
561 	struct io_uring_rsrc_update2 up2;
562 	int ret;
563 
564 	up2.offset = up->offset;
565 	up2.data = up->arg;
566 	up2.nr = 0;
567 	up2.tags = 0;
568 	up2.resv = 0;
569 	up2.resv2 = 0;
570 
571 	if (up->offset == IORING_FILE_INDEX_ALLOC) {
572 		ret = io_files_update_with_index_alloc(req, issue_flags);
573 	} else {
574 		io_ring_submit_lock(ctx, issue_flags);
575 		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
576 						&up2, up->nr_args);
577 		io_ring_submit_unlock(ctx, issue_flags);
578 	}
579 
580 	if (ret < 0)
581 		req_set_fail(req);
582 	io_req_set_res(req, ret, 0);
583 	return IOU_COMPLETE;
584 }
585 
586 void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
587 {
588 	if (node->tag)
589 		io_post_aux_cqe(ctx, node->tag, 0, 0);
590 
591 	switch (node->type) {
592 	case IORING_RSRC_FILE:
593 		fput(io_slot_file(node));
594 		break;
595 	case IORING_RSRC_BUFFER:
596 		io_buffer_unmap(ctx, node->buf);
597 		break;
598 	default:
599 		WARN_ON_ONCE(1);
600 		break;
601 	}
602 
603 	io_cache_free(&ctx->node_cache, node);
604 }
605 
606 int io_sqe_files_unregister(struct io_ring_ctx *ctx)
607 {
608 	if (!ctx->file_table.data.nr)
609 		return -ENXIO;
610 
611 	io_free_file_tables(ctx, &ctx->file_table);
612 	io_file_table_set_alloc_range(ctx, 0, 0);
613 	return 0;
614 }
615 
/*
 * Register a file table of @nr_args slots.
 *
 * @arg is a user array of fds, or NULL for a fully sparse table. @tags is an
 * optional user array of tags. A slot whose fd is -1 (or any slot when @arg
 * is NULL) is left empty; giving such a slot a non-zero tag fails with
 * -EINVAL.
 *
 * Returns -EBUSY if a table is already registered, -EINVAL for zero
 * @nr_args, -EMFILE if @nr_args exceeds IORING_MAX_FIXED_FILES or
 * RLIMIT_NOFILE, -ENOMEM on allocation failure, -EFAULT on a failed user
 * copy and -EBADF for an invalid fd or an io_uring fd. On any failure after
 * the table was allocated, tags are cleared and the table is unregistered
 * again.
 */
int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_table.data.nr)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args))
		return -ENOMEM;

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		/* ret is pre-set to the error of the next check that can fail */
		ret = -EFAULT;
		if (tags && copy_from_user(&tag, &tags[i], sizeof(tag)))
			goto fail;
		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd)))
			goto fail;
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (tag)
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		ret = -ENOMEM;
		node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
		if (!node) {
			fput(file);
			goto fail;
		}
		if (tag)
			node->tag = tag;
		ctx->file_table.data.nodes[i] = node;
		io_fixed_file_set(node, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
	return 0;
fail:
	/* tags are zeroed before the nodes are dropped by the unregister */
	io_clear_table_tags(&ctx->file_table.data);
	io_sqe_files_unregister(ctx);
	return ret;
}
685 
686 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
687 {
688 	if (!ctx->buf_table.nr)
689 		return -ENXIO;
690 	io_rsrc_data_free(ctx, &ctx->buf_table);
691 	return 0;
692 }
693 
694 /*
695  * Undo hpage_acct_ref() calls made during io_buffer_account_pin() on failure.
696  * This operates on the pages array since imu->bvec isn't populated yet.
697  */
698 static void io_buffer_unaccount_hpages(struct io_ring_ctx *ctx,
699 				       struct page **pages, int nr_pages)
700 {
701 	struct page *seen = NULL;
702 	int i;
703 
704 	if (!ctx->user)
705 		return;
706 
707 	for (i = 0; i < nr_pages; i++) {
708 		struct page *hpage;
709 
710 		if (!PageCompound(pages[i]))
711 			continue;
712 
713 		hpage = compound_head(pages[i]);
714 		if (hpage == seen)
715 			continue;
716 		seen = hpage;
717 
718 		hpage_acct_unref(ctx, hpage);
719 		cond_resched();
720 	}
721 }
722 
/*
 * Account the pinned @pages of a buffer being registered.
 *
 * Non-compound pages count as one page each. For a compound page a
 * reference is taken with hpage_acct_ref() and its full size is counted only
 * if that was the first reference in this ring. Consecutive entries sharing
 * a head page are handled once. The resulting total is then charged through
 * io_account_mem().
 *
 * Returns 0 on success, or immediately when the ring has no ctx->user. On
 * failure the hugepage references taken so far are dropped again and the
 * error is returned.
 */
static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages)
{
	unsigned long acct_pages = 0;
	struct page *seen = NULL;
	int i, ret;

	if (!ctx->user)
		return 0;

	for (i = 0; i < nr_pages; i++) {
		struct page *hpage;
		bool acct_new;

		if (!PageCompound(pages[i])) {
			acct_pages++;
			continue;
		}

		hpage = compound_head(pages[i]);
		if (hpage == seen)
			continue;
		seen = hpage;

		ret = hpage_acct_ref(ctx, hpage, &acct_new);
		if (ret) {
			/* pages[i] got no reference, so only undo [0, i) */
			io_buffer_unaccount_hpages(ctx, pages, i);
			return ret;
		}
		if (acct_new)
			acct_pages += page_size(hpage) >> PAGE_SHIFT;
		cond_resched();
	}

	/* Try to account the memory */
	if (acct_pages) {
		ret = io_account_mem(ctx->user, ctx->mm_account, acct_pages);
		if (ret) {
			/* Undo the refs we just added */
			io_buffer_unaccount_hpages(ctx, pages, nr_pages);
			return ret;
		}
	}

	return 0;
}
769 
/*
 * Collapse a per-page array into a per-folio array, using the layout that
 * io_check_coalesce_buffer() stored in @data.
 *
 * On success *@pages is replaced by a new array holding one head page per
 * folio, the old array is freed, *@nr_pages becomes data->nr_folios and true
 * is returned. For every folio all but one of the pins covered by the old
 * array are dropped. If the new array cannot be allocated, false is returned
 * and *@pages / *@nr_pages are left unchanged.
 */
static bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
				struct io_imu_folio_data *data)
{
	struct page **page_array = *pages, **new_array = NULL;
	unsigned nr_pages_left = *nr_pages;
	unsigned nr_folios = data->nr_folios;
	unsigned i, j;

	/* Store head pages only */
	new_array = kvmalloc_objs(struct page *, nr_folios);
	if (!new_array)
		return false;

	/* i walks folios, j walks the original per-page array */
	for (i = 0, j = 0; i < nr_folios; i++) {
		struct page *p = compound_head(page_array[j]);
		struct folio *folio = page_folio(p);
		unsigned int nr;

		/* every folio after the first must start at its head page */
		WARN_ON_ONCE(i > 0 && p != page_array[j]);

		/* the first folio may be entered part-way, the last may be cut short */
		nr = i ? data->nr_pages_mid : data->nr_pages_head;
		nr = min(nr, nr_pages_left);
		/* Drop all but one ref, the entire folio will remain pinned. */
		if (nr > 1)
			unpin_user_folio(folio, nr - 1);
		j += nr;
		nr_pages_left -= nr;
		new_array[i] = p;
	}

	WARN_ON_ONCE(j != *nr_pages);

	kvfree(page_array);
	*pages = new_array;
	*nr_pages = nr_folios;
	return true;
}
807 
/*
 * Decide whether @page_array can be described per folio instead of per page
 * and record the layout in @data.
 *
 * @data receives:
 *   nr_pages_mid         - page count of the first folio, which every later
 *                          folio must match in size
 *   folio_shift          - shift of the first folio
 *   first_folio_page_idx - index of page_array[0] inside its folio
 *   nr_pages_head        - number of array entries that fall in the first
 *                          folio
 *   nr_folios            - number of folios covered (written only when true
 *                          is returned)
 *
 * Returns false as soon as the array leaves the first folio before reaching
 * its last page, a middle folio is not fully covered, a later folio has a
 * different size, or a later folio is not entered at its first page.
 */
bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
			      struct io_imu_folio_data *data)
{
	struct folio *folio = page_folio(page_array[0]);
	unsigned int count = 1, nr_folios = 1;
	int i;

	data->nr_pages_mid = folio_nr_pages(folio);
	data->folio_shift = folio_shift(folio);
	data->first_folio_page_idx = folio_page_idx(folio, page_array[0]);

	/*
	 * Check if pages are contiguous inside a folio, and all folios have
	 * the same page count except for the head and tail.
	 */
	for (i = 1; i < nr_pages; i++) {
		/* still inside the current folio and physically adjacent */
		if (page_folio(page_array[i]) == folio &&
			page_array[i] == page_array[i-1] + 1) {
			count++;
			continue;
		}

		/* the run that just ended must reach the end of its folio */
		if (nr_folios == 1) {
			if (folio_page_idx(folio, page_array[i-1]) !=
				data->nr_pages_mid - 1)
				return false;

			data->nr_pages_head = count;
		} else if (count != data->nr_pages_mid) {
			return false;
		}

		/* the next run must start a same-sized folio at page 0 */
		folio = page_folio(page_array[i]);
		if (folio_size(folio) != (1UL << data->folio_shift) ||
			folio_page_idx(folio, page_array[i]) != 0)
			return false;

		count = 1;
		nr_folios++;
	}
	/* everything was in one folio, so the head run is the whole array */
	if (nr_folios == 1)
		data->nr_pages_head = count;

	data->nr_folios = nr_folios;
	return true;
}
854 
/*
 * Pin and register the single user buffer described by @iov.
 *
 * Returns:
 *   NULL     if iov_base is NULL and iov_len is 0 (the caller treats this as
 *            "no buffer for this slot")
 *   ERR_PTR  -EFAULT if iov_base is NULL but iov_len is not, the error from
 *            io_validate_user_buf_range(), io_pin_pages() or
 *            io_buffer_account_pin(), or -ENOMEM on allocation failure
 *   a node   of type IORING_RSRC_BUFFER whose ->buf is the new
 *            io_mapped_ubuf, on success
 *
 * The temporary pages array is always freed before returning. On failure
 * the pinned pages are unpinned and the imu and node are freed.
 */
static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
						   struct iovec *iov)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	struct io_rsrc_node *node;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;
	struct io_imu_folio_data data;
	bool coalesced = false;

	if (!iov->iov_base) {
		if (iov->iov_len)
			return ERR_PTR(-EFAULT);
		/* remove the buffer without installing a new one */
		return NULL;
	}

	ret = io_validate_user_buf_range((unsigned long)iov->iov_base,
					 iov->iov_len);
	if (ret)
		return ERR_PTR(ret);

	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
	if (!node)
		return ERR_PTR(-ENOMEM);

	/* default error for the allocation failures below */
	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
				&nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	/* If it's huge page(s), try to coalesce them into fewer bvec entries */
	if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) {
		if (data.nr_pages_mid != 1)
			coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
	}

	imu = io_alloc_imu(ctx, nr_pages);
	if (!imu)
		goto done;

	/* set before any failure path: io_free_imu() looks at nr_bvecs */
	imu->nr_bvecs = nr_pages;
	ret = io_buffer_account_pin(ctx, pages, nr_pages);
	if (ret)
		goto done;

	size = iov->iov_len;
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->len = iov->iov_len;
	imu->folio_shift = PAGE_SHIFT;
	imu->release = io_release_ubuf;
	imu->priv = imu;
	imu->flags = 0;
	imu->dir = IO_IMU_DEST | IO_IMU_SOURCE;
	if (coalesced)
		imu->folio_shift = data.folio_shift;
	refcount_set(&imu->refs, 1);

	/* offset of the buffer start inside the first page or folio */
	off = (unsigned long)iov->iov_base & ~PAGE_MASK;
	if (coalesced)
		off += data.first_folio_page_idx << PAGE_SHIFT;

	node->buf = imu;
	ret = 0;

	/* only the first bvec carries an offset; the last may be short */
	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
		off = 0;
		size -= vec_len;
	}
done:
	if (ret) {
		if (imu)
			io_free_imu(ctx, imu);
		if (pages) {
			for (i = 0; i < nr_pages; i++)
				unpin_user_folio(page_folio(pages[i]), 1);
		}
		io_cache_free(&ctx->node_cache, node);
		node = ERR_PTR(ret);
	}
	kvfree(pages);
	return node;
}
949 
/*
 * Register a buffer table of @nr_args slots.
 *
 * @arg is a user array of iovecs (compat layout when the ring is compat), or
 * NULL, in which case every slot is registered from a zeroed iovec and ends
 * up empty. @tags is an optional user array of tags; a non-zero tag on an
 * empty slot fails with -EINVAL.
 *
 * Returns -EBUSY if a table is already registered, -EINVAL if @nr_args is
 * zero or above IORING_MAX_REG_BUFFERS, the io_rsrc_data_alloc() error, or
 * the first per-entry error. On a per-entry error the partially filled table
 * is installed, its tags are cleared and it is unregistered again.
 */
int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct io_rsrc_data data;
	struct iovec fast_iov, *iov = &fast_iov;
	const struct iovec __user *uvec;
	int i, ret;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->buf_table.nr)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	/* ret is 0 from here on unless an entry below fails */
	ret = io_rsrc_data_alloc(&data, nr_args);
	if (ret)
		return ret;

	if (!arg)
		memset(iov, 0, sizeof(*iov));

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		if (arg) {
			uvec = (struct iovec __user *) arg;
			iov = iovec_from_user(uvec, 1, 1, &fast_iov, io_is_compat(ctx));
			if (IS_ERR(iov)) {
				ret = PTR_ERR(iov);
				break;
			}
			/* step to the next user iovec, honouring the compat layout */
			if (io_is_compat(ctx))
				arg += sizeof(struct compat_iovec);
			else
				arg += sizeof(struct iovec);
		}

		if (tags) {
			if (copy_from_user(&tag, &tags[i], sizeof(tag))) {
				ret = -EFAULT;
				break;
			}
		}

		node = io_sqe_buffer_register(ctx, iov);
		if (IS_ERR(node)) {
			ret = PTR_ERR(node);
			break;
		}
		if (tag) {
			/* a tag needs a real buffer to be attached to */
			if (!node) {
				ret = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		data.nodes[i] = node;
	}

	/* install even on error so the unregister below frees what was set up */
	ctx->buf_table = data;
	if (ret) {
		io_clear_table_tags(&ctx->buf_table);
		io_sqe_buffers_unregister(ctx);
	}
	return ret;
}
1017 
/*
 * Register the bvecs of block request @rq as a kernel buffer
 * (IO_REGBUF_F_KBUF) in slot @index of the ring's buffer table.
 *
 * @release is stored as the buffer's release callback with @rq as its
 * argument. The buffer's direction is derived from rq_data_dir(@rq) and its
 * length from blk_rq_bytes(@rq).
 *
 * Runs under the ring submit lock. Returns 0 on success, -EINVAL if @index
 * is outside the table, -EBUSY if the slot is already occupied and -ENOMEM
 * on allocation failure.
 */
int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
			    void (*release)(void *), unsigned int index,
			    unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *data = &ctx->buf_table;
	struct req_iterator rq_iter;
	struct io_mapped_ubuf *imu;
	struct io_rsrc_node *node;
	struct bio_vec bv;
	unsigned int nr_bvecs = 0;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	if (index >= data->nr) {
		ret = -EINVAL;
		goto unlock;
	}
	index = array_index_nospec(index, data->nr);

	if (data->nodes[index]) {
		ret = -EBUSY;
		goto unlock;
	}

	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
	if (!node) {
		ret = -ENOMEM;
		goto unlock;
	}

	/*
	 * blk_rq_nr_phys_segments() may overestimate the number of bvecs
	 * but avoids needing to iterate over the bvecs
	 */
	imu = io_alloc_imu(ctx, blk_rq_nr_phys_segments(rq));
	if (!imu) {
		io_cache_free(&ctx->node_cache, node);
		ret = -ENOMEM;
		goto unlock;
	}

	/* kernel buffers have no user address; offsets start at 0 */
	imu->ubuf = 0;
	imu->len = blk_rq_bytes(rq);
	imu->folio_shift = PAGE_SHIFT;
	refcount_set(&imu->refs, 1);
	imu->release = release;
	imu->priv = rq;
	imu->flags = IO_REGBUF_F_KBUF;
	imu->dir = 1 << rq_data_dir(rq);

	/* copy the request's bvecs and record how many there really are */
	rq_for_each_bvec(bv, rq, rq_iter)
		imu->bvec[nr_bvecs++] = bv;
	imu->nr_bvecs = nr_bvecs;

	node->buf = imu;
	data->nodes[index] = node;
unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_register_bvec);
1080 
1081 int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
1082 			      unsigned int issue_flags)
1083 {
1084 	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
1085 	struct io_rsrc_data *data = &ctx->buf_table;
1086 	struct io_rsrc_node *node;
1087 	int ret = 0;
1088 
1089 	io_ring_submit_lock(ctx, issue_flags);
1090 	if (index >= data->nr) {
1091 		ret = -EINVAL;
1092 		goto unlock;
1093 	}
1094 	index = array_index_nospec(index, data->nr);
1095 
1096 	node = data->nodes[index];
1097 	if (!node) {
1098 		ret = -EINVAL;
1099 		goto unlock;
1100 	}
1101 	if (!(node->buf->flags & IO_REGBUF_F_KBUF)) {
1102 		ret = -EBUSY;
1103 		goto unlock;
1104 	}
1105 
1106 	io_put_rsrc_node(ctx, node);
1107 	data->nodes[index] = NULL;
1108 unlock:
1109 	io_ring_submit_unlock(ctx, issue_flags);
1110 	return ret;
1111 }
1112 EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec);
1113 
1114 static int validate_fixed_range(u64 buf_addr, size_t len,
1115 				const struct io_mapped_ubuf *imu)
1116 {
1117 	u64 buf_end;
1118 
1119 	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
1120 		return -EFAULT;
1121 	/* not inside the mapped region */
1122 	if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
1123 		return -EFAULT;
1124 	if (unlikely(len > MAX_RW_COUNT))
1125 		return -EFAULT;
1126 	return 0;
1127 }
1128 
1129 static int io_import_kbuf(int ddir, struct iov_iter *iter,
1130 			  struct io_mapped_ubuf *imu, size_t len, size_t offset)
1131 {
1132 	size_t count = len + offset;
1133 
1134 	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count);
1135 	iov_iter_advance(iter, offset);
1136 	return 0;
1137 }
1138 
/*
 * Build a bvec iterator in @iter covering @len bytes at @buf_addr of the
 * registered buffer @imu, for data direction @ddir.
 *
 * Returns -EFAULT if the range fails validate_fixed_range() or the buffer
 * does not allow @ddir. A zero @len yields an empty iterator. Kernel buffers
 * (IO_REGBUF_F_KBUF) are handed to io_import_kbuf(); for user buffers the
 * starting bvec and the offset into it are computed directly.
 */
static int io_import_fixed(int ddir, struct iov_iter *iter,
			   struct io_mapped_ubuf *imu,
			   u64 buf_addr, size_t len)
{
	const struct bio_vec *bvec;
	size_t folio_mask;
	unsigned nr_segs;
	size_t offset;
	int ret;

	ret = validate_fixed_range(buf_addr, len, imu);
	if (unlikely(ret))
		return ret;
	if (!(imu->dir & (1 << ddir)))
		return -EFAULT;
	if (unlikely(!len)) {
		iov_iter_bvec(iter, ddir, NULL, 0, 0);
		return 0;
	}

	/* byte offset of the requested range from the start of the buffer */
	offset = buf_addr - imu->ubuf;

	if (imu->flags & IO_REGBUF_F_KBUF)
		return io_import_kbuf(ddir, iter, imu, len, offset);

	/*
	 * Don't use iov_iter_advance() here, as it's really slow for
	 * using the latter parts of a big fixed buffer - it iterates
	 * over each segment manually. We can cheat a bit here for user
	 * registered nodes, because we know that:
	 *
	 * 1) it's a BVEC iter, we set it up
	 * 2) all bvecs are the same in size, except potentially the
	 *    first and last bvec
	 */
	folio_mask = (1UL << imu->folio_shift) - 1;
	bvec = imu->bvec;
	if (offset >= bvec->bv_len) {
		unsigned long seg_skip;

		/* skip first vec */
		offset -= bvec->bv_len;
		/* then as many whole folio-sized bvecs as fit in the rest */
		seg_skip = 1 + (offset >> imu->folio_shift);
		bvec += seg_skip;
		offset &= folio_mask;
	}
	/* segments touched by [offset, offset + len) from the chosen bvec on */
	nr_segs = (offset + len + bvec->bv_offset + folio_mask) >> imu->folio_shift;
	iov_iter_bvec(iter, ddir, bvec, nr_segs, len);
	iter->iov_offset = offset;
	return 0;
}
1190 
1191 inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req,
1192 					     unsigned issue_flags)
1193 {
1194 	struct io_ring_ctx *ctx = req->ctx;
1195 	struct io_rsrc_node *node;
1196 
1197 	if (req->flags & REQ_F_BUF_NODE)
1198 		return req->buf_node;
1199 	req->flags |= REQ_F_BUF_NODE;
1200 
1201 	io_ring_submit_lock(ctx, issue_flags);
1202 	node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
1203 	if (node) {
1204 		node->refs++;
1205 		req->buf_node = node;
1206 		io_ring_submit_unlock(ctx, issue_flags);
1207 		return node;
1208 	}
1209 	req->flags &= ~REQ_F_BUF_NODE;
1210 	io_ring_submit_unlock(ctx, issue_flags);
1211 	return NULL;
1212 }
1213 
1214 int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,
1215 			u64 buf_addr, size_t len, int ddir,
1216 			unsigned issue_flags)
1217 {
1218 	struct io_rsrc_node *node;
1219 
1220 	node = io_find_buf_node(req, issue_flags);
1221 	if (!node)
1222 		return -EFAULT;
1223 	return io_import_fixed(ddir, iter, node->buf, buf_addr, len);
1224 }
1225 
1226 static int io_buffer_acct_cloned_hpages(struct io_ring_ctx *ctx,
1227 					struct io_mapped_ubuf *imu)
1228 {
1229 	struct page *seen = NULL;
1230 	int i, ret = 0;
1231 
1232 	if (imu->flags & IO_REGBUF_F_KBUF || !ctx->user)
1233 		return 0;
1234 
1235 	for (i = 0; i < imu->nr_bvecs; i++) {
1236 		struct page *page = imu->bvec[i].bv_page;
1237 		struct page *hpage;
1238 		bool acct_new;
1239 
1240 		if (!PageCompound(page))
1241 			continue;
1242 
1243 		hpage = compound_head(page);
1244 		if (hpage == seen)
1245 			continue;
1246 		seen = hpage;
1247 
1248 		/* Atomically add reference for cloned buffer */
1249 		ret = hpage_acct_ref(ctx, hpage, &acct_new);
1250 		if (ret)
1251 			break;
1252 
1253 		cond_resched();
1254 	}
1255 
1256 	if (!ret)
1257 		return 0;
1258 
1259 	/* Undo refs we added for bvecs [0..i) */
1260 	seen = NULL;
1261 	for (int j = 0; j < i; j++) {
1262 		struct page *p = imu->bvec[j].bv_page;
1263 		struct page *hp;
1264 
1265 		if (!PageCompound(p))
1266 			continue;
1267 		hp = compound_head(p);
1268 		if (hp == seen)
1269 			continue;
1270 		seen = hp;
1271 		hpage_acct_unref(ctx, hp);
1272 	}
1273 	return ret;
1274 }
1275 
1276 /* Lock two rings at once. The rings must be different! */
1277 static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
1278 {
1279 	if (ctx1 > ctx2)
1280 		swap(ctx1, ctx2);
1281 	mutex_lock(&ctx1->uring_lock);
1282 	mutex_lock_nested(&ctx2->uring_lock, SINGLE_DEPTH_NESTING);
1283 }
1284 
1285 /* Both rings are locked by the caller. */
1286 static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
1287 			    struct io_uring_clone_buffers *arg)
1288 {
1289 	struct io_rsrc_data data;
1290 	int i, ret, off, nr;
1291 	unsigned int nbufs;
1292 
1293 	lockdep_assert_held(&ctx->uring_lock);
1294 	lockdep_assert_held(&src_ctx->uring_lock);
1295 
1296 	/*
1297 	 * Accounting state is shared between the two rings; that only works if
1298 	 * both rings are accounted towards the same counters.
1299 	 */
1300 	if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account)
1301 		return -EINVAL;
1302 
1303 	/* if offsets are given, must have nr specified too */
1304 	if (!arg->nr && (arg->dst_off || arg->src_off))
1305 		return -EINVAL;
1306 	/* not allowed unless REPLACE is set */
1307 	if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
1308 		return -EBUSY;
1309 
1310 	nbufs = src_ctx->buf_table.nr;
1311 	if (!nbufs)
1312 		return -ENXIO;
1313 	if (!arg->nr)
1314 		arg->nr = nbufs;
1315 	else if (arg->nr > nbufs)
1316 		return -EINVAL;
1317 	else if (arg->nr > IORING_MAX_REG_BUFFERS)
1318 		return -EINVAL;
1319 	if (check_add_overflow(arg->nr, arg->src_off, &off) || off > nbufs)
1320 		return -EOVERFLOW;
1321 	if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
1322 		return -EOVERFLOW;
1323 	if (nbufs > IORING_MAX_REG_BUFFERS)
1324 		return -EINVAL;
1325 
1326 	ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr));
1327 	if (ret)
1328 		return ret;
1329 
1330 	/* Copy original dst nodes from before the cloned range */
1331 	for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
1332 		struct io_rsrc_node *node = ctx->buf_table.nodes[i];
1333 
1334 		if (node) {
1335 			data.nodes[i] = node;
1336 			node->refs++;
1337 		}
1338 	}
1339 
1340 	off = arg->dst_off;
1341 	i = arg->src_off;
1342 	nr = arg->nr;
1343 	while (nr--) {
1344 		struct io_rsrc_node *dst_node, *src_node;
1345 
1346 		src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i);
1347 		if (!src_node) {
1348 			dst_node = NULL;
1349 		} else {
1350 			dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
1351 			if (!dst_node) {
1352 				io_rsrc_data_free(ctx, &data);
1353 				return -ENOMEM;
1354 			}
1355 
1356 			refcount_inc(&src_node->buf->refs);
1357 			dst_node->buf = src_node->buf;
1358 			/* track compound references to clones */
1359 			ret = io_buffer_acct_cloned_hpages(ctx, src_node->buf);
1360 			if (ret) {
1361 				refcount_dec(&src_node->buf->refs);
1362 				io_cache_free(&ctx->node_cache, dst_node);
1363 				io_rsrc_data_free(ctx, &data);
1364 				return ret;
1365 			}
1366 		}
1367 		data.nodes[off++] = dst_node;
1368 		i++;
1369 	}
1370 
1371 	/* Copy original dst nodes from after the cloned range */
1372 	for (i = nbufs; i < ctx->buf_table.nr; i++) {
1373 		struct io_rsrc_node *node = ctx->buf_table.nodes[i];
1374 
1375 		if (node) {
1376 			data.nodes[i] = node;
1377 			node->refs++;
1378 		}
1379 	}
1380 
1381 	/*
1382 	 * If asked for replace, put the old table. data->nodes[] holds both
1383 	 * old and new nodes at this point.
1384 	 */
1385 	if (arg->flags & IORING_REGISTER_DST_REPLACE)
1386 		io_rsrc_data_free(ctx, &ctx->buf_table);
1387 
1388 	/*
1389 	 * ctx->buf_table must be empty now - either the contents are being
1390 	 * replaced and we just freed the table, or the contents are being
1391 	 * copied to a ring that does not have buffers yet (checked at function
1392 	 * entry).
1393 	 */
1394 	WARN_ON_ONCE(ctx->buf_table.nr);
1395 	ctx->buf_table = data;
1396 	return 0;
1397 }
1398 
1399 /*
1400  * Copy the registered buffers from the source ring whose file descriptor
1401  * is given in the src_fd to the current ring. This is identical to registering
1402  * the buffers with ctx, except faster as mappings already exist.
1403  *
1404  * Since the memory is already accounted once, don't account it again.
1405  */
1406 int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
1407 {
1408 	struct io_uring_clone_buffers buf;
1409 	struct io_ring_ctx *src_ctx;
1410 	bool registered_src;
1411 	struct file *file;
1412 	int ret;
1413 
1414 	if (copy_from_user(&buf, arg, sizeof(buf)))
1415 		return -EFAULT;
1416 	if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE))
1417 		return -EINVAL;
1418 	if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr)
1419 		return -EBUSY;
1420 	if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
1421 		return -EINVAL;
1422 
1423 	registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
1424 	file = io_uring_ctx_get_file(buf.src_fd, registered_src);
1425 	if (IS_ERR(file))
1426 		return PTR_ERR(file);
1427 
1428 	src_ctx = file->private_data;
1429 	if (src_ctx != ctx) {
1430 		mutex_unlock(&ctx->uring_lock);
1431 		lock_two_rings(ctx, src_ctx);
1432 
1433 		if (src_ctx->submitter_task &&
1434 		    src_ctx->submitter_task != current) {
1435 			ret = -EEXIST;
1436 			goto out;
1437 		}
1438 	}
1439 
1440 	ret = io_clone_buffers(ctx, src_ctx, &buf);
1441 
1442 out:
1443 	if (src_ctx != ctx)
1444 		mutex_unlock(&src_ctx->uring_lock);
1445 
1446 	if (!registered_src)
1447 		fput(file);
1448 	return ret;
1449 }
1450 
1451 void io_vec_free(struct iou_vec *iv)
1452 {
1453 	if (!iv->iovec)
1454 		return;
1455 	kfree(iv->iovec);
1456 	iv->iovec = NULL;
1457 	iv->nr = 0;
1458 }
1459 
1460 int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries)
1461 {
1462 	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_NOWARN;
1463 	struct iovec *iov;
1464 
1465 	iov = kmalloc_objs(iov[0], nr_entries, gfp);
1466 	if (!iov)
1467 		return -ENOMEM;
1468 
1469 	io_vec_free(iv);
1470 	iv->iovec = iov;
1471 	iv->nr = nr_entries;
1472 	return 0;
1473 }
1474 
1475 static int io_vec_fill_bvec(int ddir, struct iov_iter *iter,
1476 				struct io_mapped_ubuf *imu,
1477 				struct iovec *iovec, unsigned nr_iovs,
1478 				struct iou_vec *vec)
1479 {
1480 	unsigned long folio_size = 1 << imu->folio_shift;
1481 	unsigned long folio_mask = folio_size - 1;
1482 	struct bio_vec *res_bvec = vec->bvec;
1483 	size_t total_len = 0;
1484 	unsigned bvec_idx = 0;
1485 	unsigned iov_idx;
1486 
1487 	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
1488 		size_t iov_len = iovec[iov_idx].iov_len;
1489 		u64 buf_addr = (u64)(uintptr_t)iovec[iov_idx].iov_base;
1490 		struct bio_vec *src_bvec;
1491 		size_t offset;
1492 		int ret;
1493 
1494 		ret = validate_fixed_range(buf_addr, iov_len, imu);
1495 		if (unlikely(ret))
1496 			return ret;
1497 
1498 		if (unlikely(!iov_len))
1499 			return -EFAULT;
1500 		if (unlikely(check_add_overflow(total_len, iov_len, &total_len)))
1501 			return -EOVERFLOW;
1502 
1503 		offset = buf_addr - imu->ubuf;
1504 		/*
1505 		 * Only the first bvec can have non zero bv_offset, account it
1506 		 * here and work with full folios below.
1507 		 */
1508 		offset += imu->bvec[0].bv_offset;
1509 
1510 		src_bvec = imu->bvec + (offset >> imu->folio_shift);
1511 		offset &= folio_mask;
1512 
1513 		for (; iov_len; offset = 0, bvec_idx++, src_bvec++) {
1514 			size_t seg_size = min_t(size_t, iov_len,
1515 						folio_size - offset);
1516 
1517 			bvec_set_page(&res_bvec[bvec_idx],
1518 				      src_bvec->bv_page, seg_size, offset);
1519 			iov_len -= seg_size;
1520 		}
1521 	}
1522 	if (total_len > MAX_RW_COUNT)
1523 		return -EINVAL;
1524 
1525 	iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len);
1526 	return 0;
1527 }
1528 
1529 static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs,
1530 				 struct io_mapped_ubuf *imu)
1531 {
1532 	unsigned shift = imu->folio_shift;
1533 	size_t max_segs = 0;
1534 	unsigned i;
1535 
1536 	for (i = 0; i < nr_iovs; i++) {
1537 		max_segs += (iov[i].iov_len >> shift) + 2;
1538 		if (max_segs > INT_MAX)
1539 			return -EOVERFLOW;
1540 	}
1541 	return max_segs;
1542 }
1543 
1544 static int io_vec_fill_kern_bvec(int ddir, struct iov_iter *iter,
1545 				 struct io_mapped_ubuf *imu,
1546 				 struct iovec *iovec, unsigned nr_iovs,
1547 				 struct iou_vec *vec)
1548 {
1549 	const struct bio_vec *src_bvec = imu->bvec;
1550 	struct bio_vec *res_bvec = vec->bvec;
1551 	unsigned res_idx = 0;
1552 	size_t total_len = 0;
1553 	unsigned iov_idx;
1554 
1555 	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
1556 		size_t offset = (size_t)(uintptr_t)iovec[iov_idx].iov_base;
1557 		size_t iov_len = iovec[iov_idx].iov_len;
1558 		struct bvec_iter bi = {
1559 			.bi_size        = offset + iov_len,
1560 		};
1561 		struct bio_vec bv;
1562 
1563 		bvec_iter_advance(src_bvec, &bi, offset);
1564 		for_each_mp_bvec(bv, src_bvec, bi, bi)
1565 			res_bvec[res_idx++] = bv;
1566 		total_len += iov_len;
1567 	}
1568 	iov_iter_bvec(iter, ddir, res_bvec, res_idx, total_len);
1569 	return 0;
1570 }
1571 
1572 static int iov_kern_bvec_size(const struct iovec *iov,
1573 			      const struct io_mapped_ubuf *imu,
1574 			      unsigned int *nr_seg)
1575 {
1576 	size_t offset = (size_t)(uintptr_t)iov->iov_base;
1577 	const struct bio_vec *bvec = imu->bvec;
1578 	int start = 0, i = 0;
1579 	size_t off = 0;
1580 	int ret;
1581 
1582 	ret = validate_fixed_range(offset, iov->iov_len, imu);
1583 	if (unlikely(ret))
1584 		return ret;
1585 
1586 	for (i = 0; off < offset + iov->iov_len && i < imu->nr_bvecs;
1587 			off += bvec[i].bv_len, i++) {
1588 		if (offset >= off && offset < off + bvec[i].bv_len)
1589 			start = i;
1590 	}
1591 	*nr_seg = i - start;
1592 	return 0;
1593 }
1594 
1595 static int io_kern_bvec_size(struct iovec *iov, unsigned nr_iovs,
1596 			     struct io_mapped_ubuf *imu, unsigned *nr_segs)
1597 {
1598 	unsigned max_segs = 0;
1599 	size_t total_len = 0;
1600 	unsigned i;
1601 	int ret;
1602 
1603 	*nr_segs = 0;
1604 	for (i = 0; i < nr_iovs; i++) {
1605 		if (unlikely(!iov[i].iov_len))
1606 			return -EFAULT;
1607 		if (unlikely(check_add_overflow(total_len, iov[i].iov_len,
1608 						&total_len)))
1609 			return -EOVERFLOW;
1610 		ret = iov_kern_bvec_size(&iov[i], imu, &max_segs);
1611 		if (unlikely(ret))
1612 			return ret;
1613 		*nr_segs += max_segs;
1614 	}
1615 	if (total_len > MAX_RW_COUNT)
1616 		return -EINVAL;
1617 	return 0;
1618 }
1619 
1620 int io_import_reg_vec(int ddir, struct iov_iter *iter,
1621 			struct io_kiocb *req, struct iou_vec *vec,
1622 			unsigned nr_iovs, unsigned issue_flags)
1623 {
1624 	struct io_rsrc_node *node;
1625 	struct io_mapped_ubuf *imu;
1626 	unsigned iovec_off;
1627 	struct iovec *iov;
1628 	unsigned nr_segs;
1629 
1630 	node = io_find_buf_node(req, issue_flags);
1631 	if (!node)
1632 		return -EFAULT;
1633 	imu = node->buf;
1634 	if (!(imu->dir & (1 << ddir)))
1635 		return -EFAULT;
1636 
1637 	iovec_off = vec->nr - nr_iovs;
1638 	iov = vec->iovec + iovec_off;
1639 
1640 	if (imu->flags & IO_REGBUF_F_KBUF) {
1641 		int ret = io_kern_bvec_size(iov, nr_iovs, imu, &nr_segs);
1642 
1643 		if (unlikely(ret))
1644 			return ret;
1645 	} else {
1646 		int ret = io_estimate_bvec_size(iov, nr_iovs, imu);
1647 
1648 		if (ret < 0)
1649 			return ret;
1650 		nr_segs = ret;
1651 	}
1652 
1653 	if (sizeof(struct bio_vec) > sizeof(struct iovec)) {
1654 		size_t bvec_bytes;
1655 
1656 		bvec_bytes = nr_segs * sizeof(struct bio_vec);
1657 		nr_segs = (bvec_bytes + sizeof(*iov) - 1) / sizeof(*iov);
1658 		nr_segs += nr_iovs;
1659 	}
1660 
1661 	if (nr_segs > vec->nr) {
1662 		struct iou_vec tmp_vec = {};
1663 		int ret;
1664 
1665 		ret = io_vec_realloc(&tmp_vec, nr_segs);
1666 		if (ret)
1667 			return ret;
1668 
1669 		iovec_off = tmp_vec.nr - nr_iovs;
1670 		memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs);
1671 		io_vec_free(vec);
1672 
1673 		*vec = tmp_vec;
1674 		iov = vec->iovec + iovec_off;
1675 		req->flags |= REQ_F_NEED_CLEANUP;
1676 	}
1677 
1678 	if (imu->flags & IO_REGBUF_F_KBUF)
1679 		return io_vec_fill_kern_bvec(ddir, iter, imu, iov, nr_iovs, vec);
1680 
1681 	return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec);
1682 }
1683 
1684 int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv,
1685 		      const struct iovec __user *uvec, size_t uvec_segs)
1686 {
1687 	struct iovec *iov;
1688 	int iovec_off, ret;
1689 	void *res;
1690 
1691 	if (uvec_segs > iv->nr) {
1692 		ret = io_vec_realloc(iv, uvec_segs);
1693 		if (ret)
1694 			return ret;
1695 		req->flags |= REQ_F_NEED_CLEANUP;
1696 	}
1697 
1698 	/* pad iovec to the right */
1699 	iovec_off = iv->nr - uvec_segs;
1700 	iov = iv->iovec + iovec_off;
1701 	res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov,
1702 			      io_is_compat(req->ctx));
1703 	if (IS_ERR(res))
1704 		return PTR_ERR(res);
1705 
1706 	req->flags |= REQ_F_IMPORT_BUFFER;
1707 	return 0;
1708 }
1709