xref: /linux/io_uring/kbuf.c (revision e67bf352a0847a65a157d5b02a6024c65a781e08)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/kernel.h>
3 #include <linux/errno.h>
4 #include <linux/fs.h>
5 #include <linux/file.h>
6 #include <linux/mm.h>
7 #include <linux/slab.h>
8 #include <linux/namei.h>
9 #include <linux/poll.h>
10 #include <linux/vmalloc.h>
11 #include <linux/io_uring.h>
12 
13 #include <uapi/linux/io_uring.h>
14 
15 #include "io_uring.h"
16 #include "opdef.h"
17 #include "kbuf.h"
18 #include "memmap.h"
19 
20 /* BIDs are addressed by a 16-bit field in a CQE */
21 #define MAX_BIDS_PER_BGID (1 << 16)
22 
23 /* Mapped buffer ring, return io_uring_buf from head */
24 #define io_ring_head_to_buf(br, head, mask)	&(br)->bufs[(head) & (mask)]
25 
26 struct io_provide_buf {
27 	struct file			*file;
28 	__u64				addr;
29 	__u32				len;
30 	__u32				bgid;
31 	__u32				nbufs;
32 	__u16				bid;
33 };
34 
35 static bool io_kbuf_inc_commit(struct io_buffer_list *bl, int len)
36 {
37 	while (len) {
38 		struct io_uring_buf *buf;
39 		u32 buf_len, this_len;
40 
41 		buf = io_ring_head_to_buf(bl->buf_ring, bl->head, bl->mask);
42 		buf_len = READ_ONCE(buf->len);
43 		this_len = min_t(u32, len, buf_len);
44 		buf_len -= this_len;
45 		/* Stop on a partially consumed buffer, or an invalid length of 0 */
46 		if (buf_len || !this_len) {
47 			WRITE_ONCE(buf->addr, READ_ONCE(buf->addr) + this_len);
48 			WRITE_ONCE(buf->len, buf_len);
49 			return false;
50 		}
51 		WRITE_ONCE(buf->len, 0);
52 		bl->head++;
53 		len -= this_len;
54 	}
55 	return true;
56 }
57 
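/*
 * Commit ring provided buffer(s) consumed by a request. For incrementally
 * consumed rings (IOBL_INC) only the bytes actually used are released, and
 * a partially used buffer stays at the head; otherwise the head is simply
 * advanced by @nr buffers. Returns true if the consumed buffers were fully
 * retired, false if a partially used buffer remains.
 */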
58 bool io_kbuf_commit(struct io_kiocb *req,
59 		    struct io_buffer_list *bl, int len, int nr)
60 {
61 	if (unlikely(!(req->flags & REQ_F_BUFFERS_COMMIT)))
62 		return true;
63 
64 	req->flags &= ~REQ_F_BUFFERS_COMMIT;
65 
66 	if (unlikely(len < 0))
67 		return true;
68 	if (bl->flags & IOBL_INC)
69 		return io_kbuf_inc_commit(bl, len);
70 	bl->head += nr;
71 	return true;
72 }
73 
74 static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
75 							unsigned int bgid)
76 {
77 	lockdep_assert_held(&ctx->uring_lock);
78 
79 	return xa_load(&ctx->io_bl_xa, bgid);
80 }
81 
82 static int io_buffer_add_list(struct io_ring_ctx *ctx,
83 			      struct io_buffer_list *bl, unsigned int bgid)
84 {
85 	/*
86 	 * Store buffer group ID and finally mark the list as visible.
87 	 * The normal lookup doesn't care about the visibility as we're
88 	 * always under the ->uring_lock, but lookups from mmap do.
89 	 */
90 	bl->bgid = bgid;
91 	guard(mutex)(&ctx->mmap_lock);
92 	return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
93 }
94 
95 void io_kbuf_drop_legacy(struct io_kiocb *req)
96 {
97 	if (WARN_ON_ONCE(!(req->flags & REQ_F_BUFFER_SELECTED)))
98 		return;
99 	req->flags &= ~REQ_F_BUFFER_SELECTED;
100 	kfree(req->kbuf);
101 	req->kbuf = NULL;
102 }
103 
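/*
 * Return a legacy provided buffer to its list, e.g. when the request did
 * not end up consuming it and will be retried. If the list has since been
 * removed or replaced by a buffer ring, the buffer is freed instead.
 */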
104 bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
105 {
106 	struct io_ring_ctx *ctx = req->ctx;
107 	struct io_buffer_list *bl;
108 	struct io_buffer *buf;
109 
110 	io_ring_submit_lock(ctx, issue_flags);
111 
112 	buf = req->kbuf;
113 	bl = io_buffer_get_list(ctx, buf->bgid);
114 	/*
115 	 * If the buffer list was upgraded to a ring-based one, or removed,
116 	 * while the request was in-flight in io-wq, drop it.
117 	 */
118 	if (bl && !(bl->flags & IOBL_BUF_RING)) {
119 		list_add(&buf->list, &bl->buf_list);
120 		bl->nbufs++;
121 	} else {
122 		kfree(buf);
123 	}
124 	req->flags &= ~REQ_F_BUFFER_SELECTED;
125 	req->kbuf = NULL;
126 
127 	io_ring_submit_unlock(ctx, issue_flags);
128 	return true;
129 }
130 
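/*
 * Pick the first buffer off a legacy provided buffer list, mark the
 * request as having selected it, and return its userspace address.
 * Returns NULL if the list is empty.
 */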
131 static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
132 					      struct io_buffer_list *bl)
133 {
134 	if (!list_empty(&bl->buf_list)) {
135 		struct io_buffer *kbuf;
136 
137 		kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
138 		list_del(&kbuf->list);
139 		bl->nbufs--;
140 		if (*len == 0 || *len > kbuf->len)
141 			*len = kbuf->len;
142 		if (list_empty(&bl->buf_list))
143 			req->flags |= REQ_F_BL_EMPTY;
144 		req->flags |= REQ_F_BUFFER_SELECTED;
145 		req->kbuf = kbuf;
146 		req->buf_index = kbuf->bid;
147 		return u64_to_user_ptr(kbuf->addr);
148 	}
149 	return NULL;
150 }
151 
152 static int io_provided_buffers_select(struct io_kiocb *req, size_t *len,
153 				      struct io_buffer_list *bl,
154 				      struct iovec *iov)
155 {
156 	void __user *buf;
157 
158 	buf = io_provided_buffer_select(req, len, bl);
159 	if (unlikely(!buf))
160 		return -ENOBUFS;
161 
162 	iov[0].iov_base = buf;
163 	iov[0].iov_len = *len;
164 	return 1;
165 }
166 
167 static bool io_should_commit(struct io_kiocb *req, unsigned int issue_flags)
168 {
169 	/*
170 	 * If we came in unlocked, we have no choice but to consume the
171 	 * buffer here, otherwise nothing ensures that the buffer won't
172 	 * get used by others. This does mean it'll be pinned until the
173 	 * IO completes, as coming in unlocked means we're being called
174 	 * from io-wq context and there may be further retries in async
175 	 * hybrid mode. For the locked case, the caller must call commit
176 	 * when the transfer completes (or if we get -EAGAIN and must poll
177 	 * or retry).
178 	 */
179 	if (issue_flags & IO_URING_F_UNLOCKED)
180 		return true;
181 
182 	/* uring_cmd commits kbuf upfront, no need to auto-commit */
183 	if (!io_file_can_poll(req) && !io_is_uring_cmd(req))
184 		return true;
185 	return false;
186 }
187 
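/*
 * Select a single buffer from a provided buffer ring. The ring tail is
 * written by userspace, hence the load_acquire. If the selection must be
 * committed immediately (see io_should_commit()), the buffer is consumed
 * here and sel.buf_list is cleared; otherwise the caller commits later
 * through io_kbuf_commit().
 */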
188 static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len,
189 					      struct io_buffer_list *bl,
190 					      unsigned int issue_flags)
191 {
192 	struct io_uring_buf_ring *br = bl->buf_ring;
193 	__u16 tail, head = bl->head;
194 	struct io_br_sel sel = { };
195 	struct io_uring_buf *buf;
196 	u32 buf_len;
197 
198 	tail = smp_load_acquire(&br->tail);
199 	if (unlikely(tail == head))
200 		return sel;
201 
202 	if (head + 1 == tail)
203 		req->flags |= REQ_F_BL_EMPTY;
204 
205 	buf = io_ring_head_to_buf(br, head, bl->mask);
206 	buf_len = READ_ONCE(buf->len);
207 	if (*len == 0 || *len > buf_len)
208 		*len = buf_len;
209 	req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT;
210 	req->buf_index = READ_ONCE(buf->bid);
211 	sel.buf_list = bl;
212 	sel.addr = u64_to_user_ptr(READ_ONCE(buf->addr));
213 
214 	if (io_should_commit(req, issue_flags)) {
215 		io_kbuf_commit(req, sel.buf_list, *len, 1);
216 		sel.buf_list = NULL;
217 	}
218 	return sel;
219 }
220 
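/*
 * Select a buffer from the given buffer group, handling both ring provided
 * buffers and legacy provided buffer lists. The ctx uring_lock is taken as
 * needed via io_ring_submit_lock().
 */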
221 struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len,
222 				  unsigned buf_group, unsigned int issue_flags)
223 {
224 	struct io_ring_ctx *ctx = req->ctx;
225 	struct io_br_sel sel = { };
226 	struct io_buffer_list *bl;
227 
228 	io_ring_submit_lock(ctx, issue_flags);
229 
230 	bl = io_buffer_get_list(ctx, buf_group);
231 	if (likely(bl)) {
232 		if (bl->flags & IOBL_BUF_RING)
233 			sel = io_ring_buffer_select(req, len, bl, issue_flags);
234 		else
235 			sel.addr = io_provided_buffer_select(req, len, bl);
236 	}
237 	io_ring_submit_unlock(ctx, issue_flags);
238 	return sel;
239 }
240 
241 /* cap it at a reasonable 256: 256 iovecs fit in one page even with 4K pages */
242 #define PEEK_MAX_IMPORT		256
243 
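/*
 * Peek at available ring buffers and map them into the iovec array in
 * @arg, without committing them. If KBUF_MODE_EXPAND is set and more
 * buffers are available than iovecs, a bigger array is allocated. Returns
 * the number of iovecs filled, or a negative error code.
 */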
244 static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
245 				struct io_buffer_list *bl)
246 {
247 	struct io_uring_buf_ring *br = bl->buf_ring;
248 	struct iovec *iov = arg->iovs;
249 	int nr_iovs = arg->nr_iovs;
250 	__u16 nr_avail, tail, head;
251 	struct io_uring_buf *buf;
252 
253 	tail = smp_load_acquire(&br->tail);
254 	head = bl->head;
255 	nr_avail = min_t(__u16, tail - head, UIO_MAXIOV);
256 	if (unlikely(!nr_avail))
257 		return -ENOBUFS;
258 
259 	buf = io_ring_head_to_buf(br, head, bl->mask);
260 	if (arg->max_len) {
261 		u32 len = READ_ONCE(buf->len);
262 		size_t needed;
263 
264 		if (unlikely(!len))
265 			return -ENOBUFS;
266 		needed = (arg->max_len + len - 1) / len;
267 		needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT);
268 		if (nr_avail > needed)
269 			nr_avail = needed;
270 	}
271 
272 	/*
273 	 * only alloc a bigger array if we know we have data to map, eg not
274 	 * a speculative peek operation.
275 	 */
276 	if (arg->mode & KBUF_MODE_EXPAND && nr_avail > nr_iovs && arg->max_len) {
277 		iov = kmalloc_array(nr_avail, sizeof(struct iovec), GFP_KERNEL);
278 		if (unlikely(!iov))
279 			return -ENOMEM;
280 		if (arg->mode & KBUF_MODE_FREE)
281 			kfree(arg->iovs);
282 		arg->iovs = iov;
283 		nr_iovs = nr_avail;
284 	} else if (nr_avail < nr_iovs) {
285 		nr_iovs = nr_avail;
286 	}
287 
288 	/* set it to max, if not set, so we can use it unconditionally */
289 	if (!arg->max_len)
290 		arg->max_len = INT_MAX;
291 
292 	req->buf_index = READ_ONCE(buf->bid);
293 	do {
294 		u32 len = READ_ONCE(buf->len);
295 
296 		/* truncate end piece, if needed, for non partial buffers */
297 		if (len > arg->max_len) {
298 			len = arg->max_len;
299 			if (!(bl->flags & IOBL_INC)) {
300 				arg->partial_map = 1;
301 				if (iov != arg->iovs)
302 					break;
303 				WRITE_ONCE(buf->len, len);
304 			}
305 		}
306 
307 		iov->iov_base = u64_to_user_ptr(READ_ONCE(buf->addr));
308 		iov->iov_len = len;
309 		iov++;
310 
311 		arg->out_len += len;
312 		arg->max_len -= len;
313 		if (!arg->max_len)
314 			break;
315 
316 		buf = io_ring_head_to_buf(br, ++head, bl->mask);
317 	} while (--nr_iovs);
318 
319 	if (head == tail)
320 		req->flags |= REQ_F_BL_EMPTY;
321 
322 	req->flags |= REQ_F_BUFFER_RING;
323 	return iov - arg->iovs;
324 }
325 
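/*
 * Select one or more buffers for a request. Ring provided buffers are
 * committed up front, as they are handed to the request and cannot be
 * recycled afterwards.
 */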
326 int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
327 		      struct io_br_sel *sel, unsigned int issue_flags)
328 {
329 	struct io_ring_ctx *ctx = req->ctx;
330 	int ret = -ENOENT;
331 
332 	io_ring_submit_lock(ctx, issue_flags);
333 	sel->buf_list = io_buffer_get_list(ctx, arg->buf_group);
334 	if (unlikely(!sel->buf_list))
335 		goto out_unlock;
336 
337 	if (sel->buf_list->flags & IOBL_BUF_RING) {
338 		ret = io_ring_buffers_peek(req, arg, sel->buf_list);
339 		/*
340 		 * Don't recycle these buffers if we need to go through poll.
341 		 * Nobody else can use them anyway, and holding on to provided
342 		 * buffers for a send/write operation would happen on the app
343 		 * side anyway with normal buffers. Besides, we already
344 		 * committed them; they cannot be put back in the queue.
345 		 */
346 		if (ret > 0) {
347 			req->flags |= REQ_F_BUFFERS_COMMIT | REQ_F_BL_NO_RECYCLE;
348 			io_kbuf_commit(req, sel->buf_list, arg->out_len, ret);
349 		}
350 	} else {
351 		ret = io_provided_buffers_select(req, &arg->out_len, sel->buf_list, arg->iovs);
352 	}
353 out_unlock:
354 	if (issue_flags & IO_URING_F_UNLOCKED) {
355 		sel->buf_list = NULL;
356 		mutex_unlock(&ctx->uring_lock);
357 	}
358 	return ret;
359 }
360 
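/*
 * Like io_buffers_select(), but only peeks at the buffers without
 * committing them. The caller must hold ->uring_lock and commit (or
 * recycle) the selection later.
 */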
361 int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
362 		    struct io_br_sel *sel)
363 {
364 	struct io_ring_ctx *ctx = req->ctx;
365 	struct io_buffer_list *bl;
366 	int ret;
367 
368 	lockdep_assert_held(&ctx->uring_lock);
369 
370 	bl = io_buffer_get_list(ctx, arg->buf_group);
371 	if (unlikely(!bl))
372 		return -ENOENT;
373 
374 	if (bl->flags & IOBL_BUF_RING) {
375 		ret = io_ring_buffers_peek(req, arg, bl);
376 		if (ret > 0)
377 			req->flags |= REQ_F_BUFFERS_COMMIT;
378 		sel->buf_list = bl;
379 		return ret;
380 	}
381 
382 	/* don't support multiple buffer selections for legacy */
383 	sel->buf_list = NULL;
384 	return io_provided_buffers_select(req, &arg->max_len, bl, arg->iovs);
385 }
386 
387 static inline bool __io_put_kbuf_ring(struct io_kiocb *req,
388 				      struct io_buffer_list *bl, int len, int nr)
389 {
390 	bool ret = true;
391 
392 	if (bl)
393 		ret = io_kbuf_commit(req, bl, len, nr);
394 
395 	req->flags &= ~REQ_F_BUFFER_RING;
396 	return ret;
397 }
398 
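/*
 * Release the buffer(s) attached to a request and build the CQE flags
 * describing them: IORING_CQE_F_BUFFER plus the buffer ID, and
 * IORING_CQE_F_BUF_MORE if a partially consumed incremental buffer
 * remains usable.
 */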
399 unsigned int __io_put_kbufs(struct io_kiocb *req, struct io_buffer_list *bl,
400 			    int len, int nbufs)
401 {
402 	unsigned int ret;
403 
404 	ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
405 
406 	if (unlikely(!(req->flags & REQ_F_BUFFER_RING))) {
407 		io_kbuf_drop_legacy(req);
408 		return ret;
409 	}
410 
411 	if (!__io_put_kbuf_ring(req, bl, len, nbufs))
412 		ret |= IORING_CQE_F_BUF_MORE;
413 	return ret;
414 }
415 
416 static int io_remove_buffers_legacy(struct io_ring_ctx *ctx,
417 				    struct io_buffer_list *bl,
418 				    unsigned long nbufs)
419 {
420 	unsigned long i = 0;
421 	struct io_buffer *nxt;
422 
423 	/* the legacy provided buffer list is protected by ->uring_lock */
424 	lockdep_assert_held(&ctx->uring_lock);
425 	WARN_ON_ONCE(bl->flags & IOBL_BUF_RING);
426 
427 	for (i = 0; i < nbufs && !list_empty(&bl->buf_list); i++) {
428 		nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
429 		list_del(&nxt->list);
430 		bl->nbufs--;
431 		kfree(nxt);
432 		cond_resched();
433 	}
434 	return i;
435 }
436 
437 static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
438 {
439 	if (bl->flags & IOBL_BUF_RING)
440 		io_free_region(ctx->user, &bl->region);
441 	else
442 		io_remove_buffers_legacy(ctx, bl, -1U);
443 
444 	kfree(bl);
445 }
446 
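/*
 * Tear down all buffer lists of a ring on ctx teardown. Each list is
 * unlinked from the xarray under ->mmap_lock before it's freed, so that
 * concurrent mmap lookups cannot find a dying list.
 */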
447 void io_destroy_buffers(struct io_ring_ctx *ctx)
448 {
449 	struct io_buffer_list *bl;
450 
451 	while (1) {
452 		unsigned long index = 0;
453 
454 		scoped_guard(mutex, &ctx->mmap_lock) {
455 			bl = xa_find(&ctx->io_bl_xa, &index, ULONG_MAX, XA_PRESENT);
456 			if (bl)
457 				xa_erase(&ctx->io_bl_xa, bl->bgid);
458 		}
459 		if (!bl)
460 			break;
461 		io_put_bl(ctx, bl);
462 	}
463 }
464 
465 static void io_destroy_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
466 {
467 	scoped_guard(mutex, &ctx->mmap_lock)
468 		WARN_ON_ONCE(xa_erase(&ctx->io_bl_xa, bl->bgid) != bl);
469 	io_put_bl(ctx, bl);
470 }
471 
472 int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
473 {
474 	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
475 	u64 tmp;
476 
477 	if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
478 	    sqe->splice_fd_in)
479 		return -EINVAL;
480 
481 	tmp = READ_ONCE(sqe->fd);
482 	if (!tmp || tmp > MAX_BIDS_PER_BGID)
483 		return -EINVAL;
484 
485 	memset(p, 0, sizeof(*p));
486 	p->nbufs = tmp;
487 	p->bgid = READ_ONCE(sqe->buf_group);
488 	return 0;
489 }
490 
491 int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
492 {
493 	unsigned long size, tmp_check;
494 	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
495 	u64 tmp;
496 
497 	if (sqe->rw_flags || sqe->splice_fd_in)
498 		return -EINVAL;
499 
500 	tmp = READ_ONCE(sqe->fd);
501 	if (!tmp || tmp > MAX_BIDS_PER_BGID)
502 		return -E2BIG;
503 	p->nbufs = tmp;
504 	p->addr = READ_ONCE(sqe->addr);
505 	p->len = READ_ONCE(sqe->len);
506 	if (!p->len)
507 		return -EINVAL;
508 
509 	if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
510 				&size))
511 		return -EOVERFLOW;
512 	if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
513 		return -EOVERFLOW;
514 	if (!access_ok(u64_to_user_ptr(p->addr), size))
515 		return -EFAULT;
516 
517 	p->bgid = READ_ONCE(sqe->buf_group);
518 	tmp = READ_ONCE(sqe->off);
519 	if (tmp > USHRT_MAX)
520 		return -E2BIG;
521 	if (tmp + p->nbufs > MAX_BIDS_PER_BGID)
522 		return -EINVAL;
523 	p->bid = tmp;
524 	return 0;
525 }
526 
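/*
 * Add the buffers described by an IORING_OP_PROVIDE_BUFFERS request to a
 * legacy buffer list. Returns 0 if at least one buffer was added,
 * otherwise the error hit on the first buffer.
 */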
527 static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
528 			  struct io_buffer_list *bl)
529 {
530 	struct io_buffer *buf;
531 	u64 addr = pbuf->addr;
532 	int ret = -ENOMEM, i, bid = pbuf->bid;
533 
534 	for (i = 0; i < pbuf->nbufs; i++) {
535 		/*
536 		 * Nonsensical to have more buffers in a list than a 16-bit bid
537 		 * can address, as the application then has no way of knowing
538 		 * which duplicate bid refers to what buffer.
539 		 */
540 		if (bl->nbufs == USHRT_MAX) {
541 			ret = -EOVERFLOW;
542 			break;
543 		}
544 		buf = kmalloc_obj(*buf, GFP_KERNEL_ACCOUNT);
545 		if (!buf)
546 			break;
547 
548 		list_add_tail(&buf->list, &bl->buf_list);
549 		bl->nbufs++;
550 		buf->addr = addr;
551 		buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
552 		buf->bid = bid;
553 		buf->bgid = pbuf->bgid;
554 		addr += pbuf->len;
555 		bid++;
556 		cond_resched();
557 	}
558 
559 	return i ? 0 : ret;
560 }
561 
562 static int __io_manage_buffers_legacy(struct io_kiocb *req,
563 					struct io_buffer_list *bl)
564 {
565 	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
566 	int ret;
567 
568 	if (!bl) {
569 		if (req->opcode != IORING_OP_PROVIDE_BUFFERS)
570 			return -ENOENT;
571 		bl = kzalloc_obj(*bl, GFP_KERNEL_ACCOUNT);
572 		if (!bl)
573 			return -ENOMEM;
574 
575 		INIT_LIST_HEAD(&bl->buf_list);
576 		ret = io_buffer_add_list(req->ctx, bl, p->bgid);
577 		if (ret) {
578 			kfree(bl);
579 			return ret;
580 		}
581 	}
582 	/* can't use provide/remove buffers command on mapped buffers */
583 	if (bl->flags & IOBL_BUF_RING)
584 		return -EINVAL;
585 	if (req->opcode == IORING_OP_PROVIDE_BUFFERS)
586 		return io_add_buffers(req->ctx, p, bl);
587 	return io_remove_buffers_legacy(req->ctx, bl, p->nbufs);
588 }
589 
590 int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags)
591 {
592 	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
593 	struct io_ring_ctx *ctx = req->ctx;
594 	struct io_buffer_list *bl;
595 	int ret;
596 
597 	io_ring_submit_lock(ctx, issue_flags);
598 	bl = io_buffer_get_list(ctx, p->bgid);
599 	ret = __io_manage_buffers_legacy(req, bl);
600 	io_ring_submit_unlock(ctx, issue_flags);
601 
602 	if (ret < 0)
603 		req_set_fail(req);
604 	io_req_set_res(req, ret, 0);
605 	return IOU_COMPLETE;
606 }
607 
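/*
 * Register a provided buffer ring (IORING_REGISTER_PBUF_RING). The ring
 * memory is either supplied by the application through reg.ring_addr, or
 * allocated by the kernel and mmap'ed by the application when
 * IOU_PBUF_RING_MMAP is set.
 */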
608 int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
609 {
610 	struct io_uring_buf_reg reg;
611 	struct io_buffer_list *bl;
612 	struct io_uring_region_desc rd;
613 	struct io_uring_buf_ring *br;
614 	unsigned long mmap_offset;
615 	unsigned long ring_size;
616 	int ret;
617 
618 	lockdep_assert_held(&ctx->uring_lock);
619 
620 	if (copy_from_user(&reg, arg, sizeof(reg)))
621 		return -EFAULT;
622 	if (!mem_is_zero(reg.resv, sizeof(reg.resv)))
623 		return -EINVAL;
624 	if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC))
625 		return -EINVAL;
626 	if (!is_power_of_2(reg.ring_entries))
627 		return -EINVAL;
628 	/* cannot disambiguate full vs empty due to head/tail size */
629 	if (reg.ring_entries >= 65536)
630 		return -EINVAL;
631 
632 	bl = io_buffer_get_list(ctx, reg.bgid);
633 	if (bl) {
634 		/* if a mapped buffer ring OR classic provided buffers exist, don't allow */
635 		if (bl->flags & IOBL_BUF_RING || !list_empty(&bl->buf_list))
636 			return -EEXIST;
637 		io_destroy_bl(ctx, bl);
638 	}
639 
640 	bl = kzalloc_obj(*bl, GFP_KERNEL_ACCOUNT);
641 	if (!bl)
642 		return -ENOMEM;
643 
644 	mmap_offset = (unsigned long)reg.bgid << IORING_OFF_PBUF_SHIFT;
645 	ring_size = flex_array_size(br, bufs, reg.ring_entries);
646 
647 	memset(&rd, 0, sizeof(rd));
648 	rd.size = PAGE_ALIGN(ring_size);
649 	if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
650 		rd.user_addr = reg.ring_addr;
651 		rd.flags |= IORING_MEM_REGION_TYPE_USER;
652 	}
653 	ret = io_create_region(ctx, &bl->region, &rd, mmap_offset);
654 	if (ret)
655 		goto fail;
656 	br = io_region_get_ptr(&bl->region);
657 
658 #ifdef SHM_COLOUR
659 	/*
660 	 * On platforms that have specific aliasing requirements, SHM_COLOUR
661 	 * is set and we must guarantee that the kernel and user side align
662 	 * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set, as the
663 	 * application then maps the ring buffer memory itself. Fail the request
664 	 * if we, by chance, don't end up with aligned addresses. The app
665 	 * should use IOU_PBUF_RING_MMAP instead, and liburing will handle
666 	 * this transparently.
667 	 */
668 	if (!(reg.flags & IOU_PBUF_RING_MMAP) &&
669 	    ((reg.ring_addr | (unsigned long)br) & (SHM_COLOUR - 1))) {
670 		ret = -EINVAL;
671 		goto fail;
672 	}
673 #endif
674 
675 	bl->nr_entries = reg.ring_entries;
676 	bl->mask = reg.ring_entries - 1;
677 	bl->flags |= IOBL_BUF_RING;
678 	bl->buf_ring = br;
679 	if (reg.flags & IOU_PBUF_RING_INC)
680 		bl->flags |= IOBL_INC;
681 	ret = io_buffer_add_list(ctx, bl, reg.bgid);
682 	if (!ret)
683 		return 0;
684 fail:
685 	io_free_region(ctx->user, &bl->region);
686 	kfree(bl);
687 	return ret;
688 }
689 
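/*
 * Unregister a provided buffer ring (IORING_UNREGISTER_PBUF_RING) and
 * free its backing region.
 */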
690 int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
691 {
692 	struct io_uring_buf_reg reg;
693 	struct io_buffer_list *bl;
694 
695 	lockdep_assert_held(&ctx->uring_lock);
696 
697 	if (copy_from_user(&reg, arg, sizeof(reg)))
698 		return -EFAULT;
699 	if (!mem_is_zero(reg.resv, sizeof(reg.resv)) || reg.flags)
700 		return -EINVAL;
701 
702 	bl = io_buffer_get_list(ctx, reg.bgid);
703 	if (!bl)
704 		return -ENOENT;
705 	if (!(bl->flags & IOBL_BUF_RING))
706 		return -EINVAL;
707 
708 	scoped_guard(mutex, &ctx->mmap_lock)
709 		xa_erase(&ctx->io_bl_xa, bl->bgid);
710 
711 	io_put_bl(ctx, bl);
712 	return 0;
713 }
714 
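/*
 * IORING_REGISTER_PBUF_STATUS: report the current head of a provided
 * buffer ring back to the application.
 */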
715 int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
716 {
717 	struct io_uring_buf_status buf_status;
718 	struct io_buffer_list *bl;
719 
720 	if (copy_from_user(&buf_status, arg, sizeof(buf_status)))
721 		return -EFAULT;
722 	if (!mem_is_zero(buf_status.resv, sizeof(buf_status.resv)))
723 		return -EINVAL;
724 
725 	bl = io_buffer_get_list(ctx, buf_status.buf_group);
726 	if (!bl)
727 		return -ENOENT;
728 	if (!(bl->flags & IOBL_BUF_RING))
729 		return -EINVAL;
730 
731 	buf_status.head = bl->head;
732 	if (copy_to_user(arg, &buf_status, sizeof(buf_status)))
733 		return -EFAULT;
734 
735 	return 0;
736 }
737 
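/*
 * Look up the mapped region backing a provided buffer ring, for mmap.
 * Called with ->mmap_lock held; returns NULL if the buffer group doesn't
 * exist or isn't a buffer ring.
 */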
738 struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx,
739 					    unsigned int bgid)
740 {
741 	struct io_buffer_list *bl;
742 
743 	lockdep_assert_held(&ctx->mmap_lock);
744 
745 	bl = xa_load(&ctx->io_bl_xa, bgid);
746 	if (!bl || !(bl->flags & IOBL_BUF_RING))
747 		return NULL;
748 	return &bl->region;
749 }
750