// SPDX-License-Identifier: GPL-2.0
/* /linux/io_uring/kbuf.c (revision 220374ab2be5a05dc5e35c9a5337698c942916e1) */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/vmalloc.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "opdef.h"
#include "kbuf.h"
#include "memmap.h"

/* BIDs are addressed by a 16-bit field in a CQE */
#define MAX_BIDS_PER_BGID (1 << 16)

/* Mapped buffer ring, return io_uring_buf from head */
#define io_ring_head_to_buf(br, head, mask)	&(br)->bufs[(head) & (mask)]
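/*
 * Illustrative example of the head/mask arithmetic above: with an
 * 8-entry ring (nr_entries == 8, mask == 7), a head value of 9 picks
 * bufs[9 & 7] == bufs[1]. The 16-bit head may wrap freely; only the
 * masked value is used to index the ring.
 */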

struct io_provide_buf {
	struct file			*file;
	__u64				addr;
	__u32				len;
	__u32				bgid;
	__u32				nbufs;
	__u16				bid;
};

static bool io_kbuf_inc_commit(struct io_buffer_list *bl, int len)
{
	while (len) {
		struct io_uring_buf *buf;
		u32 buf_len, this_len;

		buf = io_ring_head_to_buf(bl->buf_ring, bl->head, bl->mask);
		buf_len = READ_ONCE(buf->len);
		this_len = min_t(u32, len, buf_len);
		buf_len -= this_len;
		/* Stop looping for invalid buffer length of 0 */
		if (buf_len || !this_len) {
			buf->addr += this_len;
			buf->len = buf_len;
			return false;
		}
		buf->len = 0;
		bl->head++;
		len -= this_len;
	}
	return true;
}

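/*
 * Commit the buffers consumed by a request back to the ring. @len is the
 * number of bytes actually transferred and @nr the number of ring entries
 * used. For incrementally consumed rings (IOBL_INC), a partially used
 * buffer is shrunk in place rather than released: e.g. a 4096 byte buffer
 * of which 1500 bytes were consumed has its addr advanced by 1500 and its
 * len reduced to 2596, and the head is not advanced. A false return means
 * the current buffer is still (partially) owned by the kernel, which the
 * completion path reports via IORING_CQE_F_BUF_MORE.
 */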
bool io_kbuf_commit(struct io_kiocb *req,
		    struct io_buffer_list *bl, int len, int nr)
{
	if (unlikely(!(req->flags & REQ_F_BUFFERS_COMMIT)))
		return true;

	req->flags &= ~REQ_F_BUFFERS_COMMIT;

	if (unlikely(len < 0))
		return true;
	if (bl->flags & IOBL_INC)
		return io_kbuf_inc_commit(bl, len);
	bl->head += nr;
	return true;
}

static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
							unsigned int bgid)
{
	lockdep_assert_held(&ctx->uring_lock);

	return xa_load(&ctx->io_bl_xa, bgid);
}

static int io_buffer_add_list(struct io_ring_ctx *ctx,
			      struct io_buffer_list *bl, unsigned int bgid)
{
	/*
	 * Store buffer group ID and finally mark the list as visible.
	 * The normal lookup doesn't care about the visibility as we're
	 * always under the ->uring_lock, but lookups from mmap do.
	 */
	bl->bgid = bgid;
	guard(mutex)(&ctx->mmap_lock);
	return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
}

void io_kbuf_drop_legacy(struct io_kiocb *req)
{
	if (WARN_ON_ONCE(!(req->flags & REQ_F_BUFFER_SELECTED)))
		return;
	req->flags &= ~REQ_F_BUFFER_SELECTED;
	kfree(req->kbuf);
	req->kbuf = NULL;
}

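/*
 * Return a legacy provided buffer to its group's free list, making it
 * available for selection again. Used when a request selected a buffer
 * but ends up not consuming it, e.g. because it must go async and retry.
 */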
bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	struct io_buffer *buf;

	io_ring_submit_lock(ctx, issue_flags);

	buf = req->kbuf;
	bl = io_buffer_get_list(ctx, buf->bgid);
	list_add(&buf->list, &bl->buf_list);
	bl->nbufs++;
	req->flags &= ~REQ_F_BUFFER_SELECTED;

	io_ring_submit_unlock(ctx, issue_flags);
	return true;
}

static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
					      struct io_buffer_list *bl)
{
	if (!list_empty(&bl->buf_list)) {
		struct io_buffer *kbuf;

		kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
		list_del(&kbuf->list);
		bl->nbufs--;
		if (*len == 0 || *len > kbuf->len)
			*len = kbuf->len;
		if (list_empty(&bl->buf_list))
			req->flags |= REQ_F_BL_EMPTY;
		req->flags |= REQ_F_BUFFER_SELECTED;
		req->kbuf = kbuf;
		req->buf_index = kbuf->bid;
		return u64_to_user_ptr(kbuf->addr);
	}
	return NULL;
}

static int io_provided_buffers_select(struct io_kiocb *req, size_t *len,
				      struct io_buffer_list *bl,
				      struct iovec *iov)
{
	void __user *buf;

	buf = io_provided_buffer_select(req, len, bl);
	if (unlikely(!buf))
		return -ENOBUFS;

	iov[0].iov_base = buf;
	iov[0].iov_len = *len;
	return 1;
}

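/*
 * Ring mapped (IOBL_BUF_RING) selection: hand out the buffer at the
 * current head, if any. The tail is written by the application when it
 * publishes new buffers, hence the acquire load below, which pairs with
 * the application's release store of the tail.
 */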
static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
					  struct io_buffer_list *bl,
					  unsigned int issue_flags)
{
	struct io_uring_buf_ring *br = bl->buf_ring;
	__u16 tail, head = bl->head;
	struct io_uring_buf *buf;
	void __user *ret;
	u32 buf_len;

	tail = smp_load_acquire(&br->tail);
	if (unlikely(tail == head))
		return NULL;

	if (head + 1 == tail)
		req->flags |= REQ_F_BL_EMPTY;

	buf = io_ring_head_to_buf(br, head, bl->mask);
	buf_len = READ_ONCE(buf->len);
	if (*len == 0 || *len > buf_len)
		*len = buf_len;
	req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT;
	req->buf_list = bl;
	req->buf_index = buf->bid;
	ret = u64_to_user_ptr(buf->addr);

	if (issue_flags & IO_URING_F_UNLOCKED || !io_file_can_poll(req)) {
		/*
		 * If we came in unlocked, we have no choice but to consume the
		 * buffer here, otherwise nothing ensures that the buffer won't
		 * get used by others. This does mean it'll be pinned until the
		 * IO completes; coming in unlocked means we're being called from
		 * io-wq context and there may be further retries in async hybrid
		 * mode. For the locked case, the caller must call commit when
		 * the transfer completes (or if we get -EAGAIN and must poll or
		 * retry).
		 */
		io_kbuf_commit(req, bl, *len, 1);
		req->buf_list = NULL;
	}
	return ret;
}

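/*
 * Select a buffer from the group identified by @buf_group for this
 * request, using the ring mapped path if the group was registered as a
 * buffer ring and the legacy provided buffer path otherwise. Returns a
 * user pointer to the buffer and clamps *len to the buffer length, or
 * NULL if the group does not exist or has no buffers available.
 */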
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
			      unsigned buf_group, unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	void __user *ret = NULL;

	io_ring_submit_lock(req->ctx, issue_flags);

	bl = io_buffer_get_list(ctx, buf_group);
	if (likely(bl)) {
		if (bl->flags & IOBL_BUF_RING)
			ret = io_ring_buffer_select(req, len, bl, issue_flags);
		else
			ret = io_provided_buffer_select(req, len, bl);
	}
	io_ring_submit_unlock(req->ctx, issue_flags);
	return ret;
}

/* cap it at a reasonable 256, will be one page even for 4K */
#define PEEK_MAX_IMPORT		256
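/*
 * Sizing note: 256 iovecs at 16 bytes each (on 64-bit) is exactly 4096
 * bytes, so the expanded iovec array allocated below stays within a
 * single 4K page.
 */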

static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
				struct io_buffer_list *bl)
{
	struct io_uring_buf_ring *br = bl->buf_ring;
	struct iovec *iov = arg->iovs;
	int nr_iovs = arg->nr_iovs;
	__u16 nr_avail, tail, head;
	struct io_uring_buf *buf;

	tail = smp_load_acquire(&br->tail);
	head = bl->head;
	nr_avail = min_t(__u16, tail - head, UIO_MAXIOV);
	if (unlikely(!nr_avail))
		return -ENOBUFS;

	buf = io_ring_head_to_buf(br, head, bl->mask);
	if (arg->max_len) {
		u32 len = READ_ONCE(buf->len);
		size_t needed;

		if (unlikely(!len))
			return -ENOBUFS;
		needed = (arg->max_len + len - 1) / len;
		needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT);
		if (nr_avail > needed)
			nr_avail = needed;
	}

	/*
	 * Only alloc a bigger array if we know we have data to map, e.g. not
	 * a speculative peek operation.
	 */
	if (arg->mode & KBUF_MODE_EXPAND && nr_avail > nr_iovs && arg->max_len) {
		iov = kmalloc_array(nr_avail, sizeof(struct iovec), GFP_KERNEL);
		if (unlikely(!iov))
			return -ENOMEM;
		if (arg->mode & KBUF_MODE_FREE)
			kfree(arg->iovs);
		arg->iovs = iov;
		nr_iovs = nr_avail;
	} else if (nr_avail < nr_iovs) {
		nr_iovs = nr_avail;
	}

	/* set it to max, if not set, so we can use it unconditionally */
	if (!arg->max_len)
		arg->max_len = INT_MAX;

	req->buf_index = buf->bid;
	do {
		u32 len = READ_ONCE(buf->len);

		/* truncate end piece, if needed, for non-partial buffers */
		if (len > arg->max_len) {
			len = arg->max_len;
			if (!(bl->flags & IOBL_INC)) {
				arg->partial_map = 1;
				if (iov != arg->iovs)
					break;
				buf->len = len;
			}
		}

		iov->iov_base = u64_to_user_ptr(buf->addr);
		iov->iov_len = len;
		iov++;

		arg->out_len += len;
		arg->max_len -= len;
		if (!arg->max_len)
			break;

		buf = io_ring_head_to_buf(br, ++head, bl->mask);
	} while (--nr_iovs);

	if (head == tail)
		req->flags |= REQ_F_BL_EMPTY;

	req->flags |= REQ_F_BUFFER_RING;
	req->buf_list = bl;
	return iov - arg->iovs;
}
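/*
 * Illustrative walk-through of the peek loop above, assuming a ring of
 * 4096 byte buffers that is not incrementally consumed (no IOBL_INC):
 * with max_len == 6000, "needed" is 2; the first iovec maps the full
 * 4096 bytes, and rather than partially consuming the second buffer the
 * loop stops and marks partial_map, returning a single iovec. Only if
 * the very first buffer is larger than max_len is its length trimmed in
 * place and the truncated piece mapped.
 */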

int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
		      unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	int ret = -ENOENT;

	io_ring_submit_lock(ctx, issue_flags);
	bl = io_buffer_get_list(ctx, arg->buf_group);
	if (unlikely(!bl))
		goto out_unlock;

	if (bl->flags & IOBL_BUF_RING) {
		ret = io_ring_buffers_peek(req, arg, bl);
		/*
		 * Don't recycle these buffers if we need to go through poll.
		 * Nobody else can use them anyway, and holding on to provided
		 * buffers for a send/write operation would happen on the app
		 * side anyway with normal buffers. Besides, we have already
		 * committed them; they cannot be put back in the queue.
		 */
		if (ret > 0) {
			req->flags |= REQ_F_BUFFERS_COMMIT | REQ_F_BL_NO_RECYCLE;
			io_kbuf_commit(req, bl, arg->out_len, ret);
		}
	} else {
		ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs);
	}
out_unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}

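/*
 * Like io_buffers_select(), but the caller already holds the ring lock
 * and nothing is committed here: REQ_F_BUFFERS_COMMIT is set so that the
 * buffers are committed (or left for recycling) only once the actual
 * transfer size is known.
 */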
int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	int ret;

	lockdep_assert_held(&ctx->uring_lock);

	bl = io_buffer_get_list(ctx, arg->buf_group);
	if (unlikely(!bl))
		return -ENOENT;

	if (bl->flags & IOBL_BUF_RING) {
		ret = io_ring_buffers_peek(req, arg, bl);
		if (ret > 0)
			req->flags |= REQ_F_BUFFERS_COMMIT;
		return ret;
	}

	/* don't support multiple buffer selections for legacy */
	return io_provided_buffers_select(req, &arg->max_len, bl, arg->iovs);
}

static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr)
{
	struct io_buffer_list *bl = req->buf_list;
	bool ret = true;

	if (bl)
		ret = io_kbuf_commit(req, bl, len, nr);

	req->flags &= ~REQ_F_BUFFER_RING;
	return ret;
}

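/*
 * Build the CQE buffer flags for a request that consumed selected
 * buffers: the buffer ID goes in the upper bits via
 * IORING_CQE_BUFFER_SHIFT, and IORING_CQE_F_BUF_MORE is set if an
 * incrementally consumed buffer still has bytes left for reuse.
 */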
unsigned int __io_put_kbufs(struct io_kiocb *req, int len, int nbufs)
{
	unsigned int ret;

	ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);

	if (unlikely(!(req->flags & REQ_F_BUFFER_RING))) {
		io_kbuf_drop_legacy(req);
		return ret;
	}

	if (!__io_put_kbuf_ring(req, len, nbufs))
		ret |= IORING_CQE_F_BUF_MORE;
	return ret;
}

static int io_remove_buffers_legacy(struct io_ring_ctx *ctx,
				    struct io_buffer_list *bl,
				    unsigned long nbufs)
{
	unsigned long i = 0;
	struct io_buffer *nxt;

	/* protects io_buffers_cache */
	lockdep_assert_held(&ctx->uring_lock);
	WARN_ON_ONCE(bl->flags & IOBL_BUF_RING);

	for (i = 0; i < nbufs && !list_empty(&bl->buf_list); i++) {
		nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
		list_del(&nxt->list);
		bl->nbufs--;
		kfree(nxt);
		cond_resched();
	}
	return i;
}

static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
{
	if (bl->flags & IOBL_BUF_RING)
		io_free_region(ctx, &bl->region);
	else
		io_remove_buffers_legacy(ctx, bl, -1U);

	kfree(bl);
}

void io_destroy_buffers(struct io_ring_ctx *ctx)
{
	struct io_buffer_list *bl;

	while (1) {
		unsigned long index = 0;

		scoped_guard(mutex, &ctx->mmap_lock) {
			bl = xa_find(&ctx->io_bl_xa, &index, ULONG_MAX, XA_PRESENT);
			if (bl)
				xa_erase(&ctx->io_bl_xa, bl->bgid);
		}
		if (!bl)
			break;
		io_put_bl(ctx, bl);
	}
}

static void io_destroy_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
{
	scoped_guard(mutex, &ctx->mmap_lock)
		WARN_ON_ONCE(xa_erase(&ctx->io_bl_xa, bl->bgid) != bl);
	io_put_bl(ctx, bl);
}

int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	u64 tmp;

	if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
	    sqe->splice_fd_in)
		return -EINVAL;

	tmp = READ_ONCE(sqe->fd);
	if (!tmp || tmp > MAX_BIDS_PER_BGID)
		return -EINVAL;

	memset(p, 0, sizeof(*p));
	p->nbufs = tmp;
	p->bgid = READ_ONCE(sqe->buf_group);
	return 0;
}

int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	unsigned long size, tmp_check;
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	u64 tmp;

	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	tmp = READ_ONCE(sqe->fd);
	if (!tmp || tmp > MAX_BIDS_PER_BGID)
		return -E2BIG;
	p->nbufs = tmp;
	p->addr = READ_ONCE(sqe->addr);
	p->len = READ_ONCE(sqe->len);
	if (!p->len)
		return -EINVAL;

	if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
				&size))
		return -EOVERFLOW;
	if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
		return -EOVERFLOW;
	if (!access_ok(u64_to_user_ptr(p->addr), size))
		return -EFAULT;

	p->bgid = READ_ONCE(sqe->buf_group);
	tmp = READ_ONCE(sqe->off);
	if (tmp > USHRT_MAX)
		return -E2BIG;
	if (tmp + p->nbufs > MAX_BIDS_PER_BGID)
		return -EINVAL;
	p->bid = tmp;
	return 0;
}
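/*
 * Illustrative userspace sketch (not kernel code) of the SQE layout that
 * io_provide_buffers_prep() above expects; field names follow the uapi
 * struct io_uring_sqe, while the base/count variables are made up for
 * the example:
 *
 *	sqe->opcode    = IORING_OP_PROVIDE_BUFFERS;
 *	sqe->fd        = nbufs;       number of buffers
 *	sqe->addr      = (u64)base;   start of the contiguous area
 *	sqe->len       = buf_len;     size of each buffer
 *	sqe->buf_group = bgid;        buffer group ID
 *	sqe->off       = first_bid;   BID of the first buffer
 *
 * liburing wraps this as io_uring_prep_provide_buffers().
 */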

static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
			  struct io_buffer_list *bl)
{
	struct io_buffer *buf;
	u64 addr = pbuf->addr;
	int ret = -ENOMEM, i, bid = pbuf->bid;

	for (i = 0; i < pbuf->nbufs; i++) {
		/*
		 * Nonsensical to have more buffers than a 16-bit bid can
		 * address in a buffer list, as the application then has no
		 * way of knowing which duplicate bid refers to what buffer.
		 */
		if (bl->nbufs == USHRT_MAX) {
			ret = -EOVERFLOW;
			break;
		}
		buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
		if (!buf)
			break;

		list_add_tail(&buf->list, &bl->buf_list);
		bl->nbufs++;
		buf->addr = addr;
		buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
		buf->bid = bid;
		buf->bgid = pbuf->bgid;
		addr += pbuf->len;
		bid++;
		cond_resched();
	}

	return i ? 0 : ret;
}

static int __io_manage_buffers_legacy(struct io_kiocb *req,
					struct io_buffer_list *bl)
{
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	int ret;

	if (!bl) {
		if (req->opcode != IORING_OP_PROVIDE_BUFFERS)
			return -ENOENT;
		bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
		if (!bl)
			return -ENOMEM;

		INIT_LIST_HEAD(&bl->buf_list);
		ret = io_buffer_add_list(req->ctx, bl, p->bgid);
		if (ret) {
			kfree(bl);
			return ret;
		}
	}
	/* can't use provide/remove buffers command on mapped buffers */
	if (bl->flags & IOBL_BUF_RING)
		return -EINVAL;
	if (req->opcode == IORING_OP_PROVIDE_BUFFERS)
		return io_add_buffers(req->ctx, p, bl);
	return io_remove_buffers_legacy(req->ctx, bl, p->nbufs);
}

int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	int ret;

	io_ring_submit_lock(ctx, issue_flags);
	bl = io_buffer_get_list(ctx, p->bgid);
	ret = __io_manage_buffers_legacy(req, bl);
	io_ring_submit_unlock(ctx, issue_flags);

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_COMPLETE;
}

int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_buf_reg reg;
	struct io_buffer_list *bl;
	struct io_uring_region_desc rd;
	struct io_uring_buf_ring *br;
	unsigned long mmap_offset;
	unsigned long ring_size;
	int ret;

	lockdep_assert_held(&ctx->uring_lock);

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (!mem_is_zero(reg.resv, sizeof(reg.resv)))
		return -EINVAL;
	if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC))
		return -EINVAL;
	if (!is_power_of_2(reg.ring_entries))
		return -EINVAL;
	/* cannot disambiguate full vs empty due to head/tail size */
	if (reg.ring_entries >= 65536)
		return -EINVAL;

	bl = io_buffer_get_list(ctx, reg.bgid);
	if (bl) {
		/* if mapped buffer ring OR classic exists, don't allow */
		if (bl->flags & IOBL_BUF_RING || !list_empty(&bl->buf_list))
			return -EEXIST;
		io_destroy_bl(ctx, bl);
	}

	bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
	if (!bl)
		return -ENOMEM;

	mmap_offset = (unsigned long)reg.bgid << IORING_OFF_PBUF_SHIFT;
	ring_size = flex_array_size(br, bufs, reg.ring_entries);

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(ring_size);
	if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
		rd.user_addr = reg.ring_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region_mmap_safe(ctx, &bl->region, &rd, mmap_offset);
	if (ret)
		goto fail;
	br = io_region_get_ptr(&bl->region);

#ifdef SHM_COLOUR
	/*
	 * On platforms that have specific aliasing requirements, SHM_COLOUR
	 * is set and we must guarantee that the kernel and user side align
	 * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
	 * the application mmap's the provided ring buffer. Fail the request
	 * if we, by chance, don't end up with aligned addresses. The app
	 * should use IOU_PBUF_RING_MMAP instead, and liburing will handle
	 * this transparently.
	 */
	if (!(reg.flags & IOU_PBUF_RING_MMAP) &&
	    ((reg.ring_addr | (unsigned long)br) & (SHM_COLOUR - 1))) {
		ret = -EINVAL;
		goto fail;
	}
#endif

	bl->nr_entries = reg.ring_entries;
	bl->mask = reg.ring_entries - 1;
	bl->flags |= IOBL_BUF_RING;
	bl->buf_ring = br;
	if (reg.flags & IOU_PBUF_RING_INC)
		bl->flags |= IOBL_INC;
	io_buffer_add_list(ctx, bl, reg.bgid);
	return 0;
fail:
	io_free_region(ctx, &bl->region);
	kfree(bl);
	return ret;
}
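/*
 * Illustrative userspace sketch (not kernel code) of registering a
 * buffer ring handled by io_register_pbuf_ring() above; "ring_mem" is a
 * made-up, page aligned allocation of flex_array_size(br, bufs, entries)
 * bytes:
 *
 *	struct io_uring_buf_reg reg = {
 *		.ring_addr    = (unsigned long)ring_mem,
 *		.ring_entries = entries,   power of 2, below 65536
 *		.bgid         = bgid,
 *	};
 *	io_uring_register(ring_fd, IORING_REGISTER_PBUF_RING, &reg, 1);
 *
 * With IOU_PBUF_RING_MMAP set in reg.flags, ring_addr is left at 0 and
 * the application instead mmap()s the ring from the io_uring fd at an
 * offset derived from the group ID (cf. mmap_offset above); liburing's
 * io_uring_setup_buf_ring() handles either variant.
 */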

int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_buf_reg reg;
	struct io_buffer_list *bl;

	lockdep_assert_held(&ctx->uring_lock);

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (!mem_is_zero(reg.resv, sizeof(reg.resv)) || reg.flags)
		return -EINVAL;

	bl = io_buffer_get_list(ctx, reg.bgid);
	if (!bl)
		return -ENOENT;
	if (!(bl->flags & IOBL_BUF_RING))
		return -EINVAL;

	scoped_guard(mutex, &ctx->mmap_lock)
		xa_erase(&ctx->io_bl_xa, bl->bgid);

	io_put_bl(ctx, bl);
	return 0;
}

int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_buf_status buf_status;
	struct io_buffer_list *bl;

	if (copy_from_user(&buf_status, arg, sizeof(buf_status)))
		return -EFAULT;
	if (!mem_is_zero(buf_status.resv, sizeof(buf_status.resv)))
		return -EINVAL;

	bl = io_buffer_get_list(ctx, buf_status.buf_group);
	if (!bl)
		return -ENOENT;
	if (!(bl->flags & IOBL_BUF_RING))
		return -EINVAL;

	buf_status.head = bl->head;
	if (copy_to_user(arg, &buf_status, sizeof(buf_status)))
		return -EFAULT;

	return 0;
}

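/*
 * Look up the mapped region backing a buffer ring so the mmap path can
 * serve it. Called under mmap_lock, which keeps the lookup stable
 * against a concurrent unregister tearing the ring down.
 */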
struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx,
					    unsigned int bgid)
{
	struct io_buffer_list *bl;

	lockdep_assert_held(&ctx->mmap_lock);

	bl = xa_load(&ctx->io_bl_xa, bgid);
	if (!bl || !(bl->flags & IOBL_BUF_RING))
		return NULL;
	return &bl->region;
}