// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "opdef.h"
#include "kbuf.h"

#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))

#define BGID_ARRAY 64

/* BIDs are addressed by a 16-bit field in a CQE */
#define MAX_BIDS_PER_BGID (1 << 16)

struct kmem_cache *io_buf_cachep;

struct io_provide_buf {
        struct file *file;
        __u64 addr;
        __u32 len;
        __u32 bgid;
        __u32 nbufs;
        __u16 bid;
};

struct io_buf_free {
        struct hlist_node list;
        void *mem;
        size_t size;
        int inuse;
};

static struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx,
                                                   struct io_buffer_list *bl,
                                                   unsigned int bgid)
{
        if (bl && bgid < BGID_ARRAY)
                return &bl[bgid];

        return xa_load(&ctx->io_bl_xa, bgid);
}

static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
                                                        unsigned int bgid)
{
        lockdep_assert_held(&ctx->uring_lock);

        return __io_buffer_get_list(ctx, ctx->io_bl, bgid);
}

static int io_buffer_add_list(struct io_ring_ctx *ctx,
                              struct io_buffer_list *bl, unsigned int bgid)
{
        /*
         * Store buffer group ID and finally mark the list as visible.
         * The normal lookup doesn't care about the visibility as we're
         * always under the ->uring_lock, but the RCU lookup from mmap does.
         */
        bl->bgid = bgid;
        smp_store_release(&bl->is_ready, 1);

        if (bgid < BGID_ARRAY)
                return 0;

        return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
}

bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_buffer_list *bl;
        struct io_buffer *buf;

        /*
         * For legacy provided buffer mode, don't recycle if we already did
         * IO to this buffer. For ring-mapped provided buffer mode, we should
         * increment ring->head to explicitly monopolize the buffer to avoid
         * multiple use.
         */
        if (req->flags & REQ_F_PARTIAL_IO)
                return false;

        io_ring_submit_lock(ctx, issue_flags);

        buf = req->kbuf;
        bl = io_buffer_get_list(ctx, buf->bgid);
        list_add(&buf->list, &bl->buf_list);
        req->flags &= ~REQ_F_BUFFER_SELECTED;
        req->buf_index = buf->bgid;

        io_ring_submit_unlock(ctx, issue_flags);
        return true;
}

unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
{
        unsigned int cflags;

        /*
         * We can add this buffer back to two lists:
         *
         * 1) The io_buffers_cache list. This one is protected by the
         *    ctx->uring_lock. If we already hold this lock, add back to this
         *    list as we can grab it from issue as well.
         * 2) The io_buffers_comp list. This one is protected by the
         *    ctx->completion_lock.
         *
         * We migrate buffers from the comp_list to the issue cache list
         * when we need one.
         */
        if (req->flags & REQ_F_BUFFER_RING) {
                /* no buffers to recycle for this case */
                cflags = __io_put_kbuf_list(req, NULL);
        } else if (issue_flags & IO_URING_F_UNLOCKED) {
                struct io_ring_ctx *ctx = req->ctx;

                spin_lock(&ctx->completion_lock);
                cflags = __io_put_kbuf_list(req, &ctx->io_buffers_comp);
                spin_unlock(&ctx->completion_lock);
        } else {
                lockdep_assert_held(&req->ctx->uring_lock);

                cflags = __io_put_kbuf_list(req, &req->ctx->io_buffers_cache);
        }
        return cflags;
}
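
/*
 * Note on the return value above (an illustrative aside, not part of the
 * original file): callers fold these cflags into the CQE, where a selected
 * buffer is reported to userspace via IORING_CQE_F_BUFFER with the buffer ID
 * in the upper bits. A userspace decode looks roughly like:
 *
 *        if (cqe->flags & IORING_CQE_F_BUFFER)
 *                bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;
 */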

static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
                                              struct io_buffer_list *bl)
{
        if (!list_empty(&bl->buf_list)) {
                struct io_buffer *kbuf;

                kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
                list_del(&kbuf->list);
                if (*len == 0 || *len > kbuf->len)
                        *len = kbuf->len;
                req->flags |= REQ_F_BUFFER_SELECTED;
                req->kbuf = kbuf;
                req->buf_index = kbuf->bid;
                return u64_to_user_ptr(kbuf->addr);
        }
        return NULL;
}

static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
                                          struct io_buffer_list *bl,
                                          unsigned int issue_flags)
{
        struct io_uring_buf_ring *br = bl->buf_ring;
        struct io_uring_buf *buf;
        __u16 head = bl->head;

        if (unlikely(smp_load_acquire(&br->tail) == head))
                return NULL;

        head &= bl->mask;
        /* mmaped buffers are always contig */
        if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) {
                buf = &br->bufs[head];
        } else {
                int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
                int index = head / IO_BUFFER_LIST_BUF_PER_PAGE;
                buf = page_address(bl->buf_pages[index]);
                buf += off;
        }
        if (*len == 0 || *len > buf->len)
                *len = buf->len;
        req->flags |= REQ_F_BUFFER_RING;
        req->buf_list = bl;
        req->buf_index = buf->bid;

        if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) {
                /*
                 * If we came in unlocked, we have no choice but to consume the
                 * buffer here, otherwise nothing ensures that the buffer won't
                 * get used by others. This does mean it'll be pinned until the
                 * IO completes, coming in unlocked means we're being called from
                 * io-wq context and there may be further retries in async hybrid
                 * mode. For the locked case, the caller must call commit when
                 * the transfer completes (or if we get -EAGAIN and must poll or
                 * retry).
                 */
                req->buf_list = NULL;
                bl->head++;
        }
        return u64_to_user_ptr(buf->addr);
}
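
/*
 * Userspace side of the ring protocol above, for reference (an illustrative
 * sketch, not part of this file): the application owns the tail and
 * publishes a buffer by filling the next io_uring_buf slot and then bumping
 * ->tail with a release store, pairing with the acquire load of ->tail in
 * io_ring_buffer_select(). Roughly:
 *
 *        unsigned short idx = tail & mask;
 *
 *        br->bufs[idx].addr = (unsigned long) addr;
 *        br->bufs[idx].len  = len;
 *        br->bufs[idx].bid  = bid;
 *        store_release(&br->tail, tail + 1);
 *
 * liburing wraps this pattern in io_uring_buf_ring_add() and
 * io_uring_buf_ring_advance().
 */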

void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
                              unsigned int issue_flags)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_buffer_list *bl;
        void __user *ret = NULL;

        io_ring_submit_lock(req->ctx, issue_flags);

        bl = io_buffer_get_list(ctx, req->buf_index);
        if (likely(bl)) {
                if (bl->is_mapped)
                        ret = io_ring_buffer_select(req, len, bl, issue_flags);
                else
                        ret = io_provided_buffer_select(req, len, bl);
        }
        io_ring_submit_unlock(req->ctx, issue_flags);
        return ret;
}
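
/*
 * For reference (a sketch, not part of this file): a request only reaches
 * io_buffer_select() if the submission opted in to buffer selection, which
 * on the userspace side looks roughly like:
 *
 *        sqe->flags |= IOSQE_BUFFER_SELECT;
 *        sqe->buf_group = bgid;        // becomes req->buf_index at prep time
 */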

static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
{
        struct io_buffer_list *bl;
        int i;

        bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), GFP_KERNEL);
        if (!bl)
                return -ENOMEM;

        for (i = 0; i < BGID_ARRAY; i++) {
                INIT_LIST_HEAD(&bl[i].buf_list);
                bl[i].bgid = i;
        }

        smp_store_release(&ctx->io_bl, bl);
        return 0;
}

/*
 * Mark the given mapped range as free for reuse
 */
static void io_kbuf_mark_free(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
{
        struct io_buf_free *ibf;

        hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
                if (bl->buf_ring == ibf->mem) {
                        ibf->inuse = 0;
                        return;
                }
        }

        /* can't happen... */
        WARN_ON_ONCE(1);
}

static int __io_remove_buffers(struct io_ring_ctx *ctx,
                               struct io_buffer_list *bl, unsigned nbufs)
{
        unsigned i = 0;

        /* shouldn't happen */
        if (!nbufs)
                return 0;

        if (bl->is_mapped) {
                i = bl->buf_ring->tail - bl->head;
                if (bl->is_mmap) {
                        /*
                         * io_kbuf_mmap_list_free() will free the page(s) at
                         * ->release() time.
                         */
                        io_kbuf_mark_free(ctx, bl);
                        bl->buf_ring = NULL;
                        bl->is_mmap = 0;
                } else if (bl->buf_nr_pages) {
                        int j;

                        for (j = 0; j < bl->buf_nr_pages; j++)
                                unpin_user_page(bl->buf_pages[j]);
                        kvfree(bl->buf_pages);
                        bl->buf_pages = NULL;
                        bl->buf_nr_pages = 0;
                }
                /* make sure it's seen as empty */
                INIT_LIST_HEAD(&bl->buf_list);
                bl->is_mapped = 0;
                return i;
        }

        /* protects io_buffers_cache */
        lockdep_assert_held(&ctx->uring_lock);

        while (!list_empty(&bl->buf_list)) {
                struct io_buffer *nxt;

                nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
                list_move(&nxt->list, &ctx->io_buffers_cache);
                if (++i == nbufs)
                        return i;
                cond_resched();
        }

        return i;
}

void io_destroy_buffers(struct io_ring_ctx *ctx)
{
        struct io_buffer_list *bl;
        struct list_head *item, *tmp;
        struct io_buffer *buf;
        unsigned long index;
        int i;

        for (i = 0; i < BGID_ARRAY; i++) {
                if (!ctx->io_bl)
                        break;
                __io_remove_buffers(ctx, &ctx->io_bl[i], -1U);
        }

        xa_for_each(&ctx->io_bl_xa, index, bl) {
                xa_erase(&ctx->io_bl_xa, bl->bgid);
                __io_remove_buffers(ctx, bl, -1U);
                kfree_rcu(bl, rcu);
        }

        /*
         * Move deferred locked entries to cache before pruning
         */
        spin_lock(&ctx->completion_lock);
        if (!list_empty(&ctx->io_buffers_comp))
                list_splice_init(&ctx->io_buffers_comp, &ctx->io_buffers_cache);
        spin_unlock(&ctx->completion_lock);

        list_for_each_safe(item, tmp, &ctx->io_buffers_cache) {
                buf = list_entry(item, struct io_buffer, list);
                kmem_cache_free(io_buf_cachep, buf);
        }
}

int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
        u64 tmp;

        if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
            sqe->splice_fd_in)
                return -EINVAL;

        tmp = READ_ONCE(sqe->fd);
        if (!tmp || tmp > MAX_BIDS_PER_BGID)
                return -EINVAL;

        memset(p, 0, sizeof(*p));
        p->nbufs = tmp;
        p->bgid = READ_ONCE(sqe->buf_group);
        return 0;
}

int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
        struct io_ring_ctx *ctx = req->ctx;
        struct io_buffer_list *bl;
        int ret = 0;

        io_ring_submit_lock(ctx, issue_flags);

        ret = -ENOENT;
        bl = io_buffer_get_list(ctx, p->bgid);
        if (bl) {
                ret = -EINVAL;
                /* can't use provide/remove buffers command on mapped buffers */
                if (!bl->is_mapped)
                        ret = __io_remove_buffers(ctx, bl, p->nbufs);
        }
        io_ring_submit_unlock(ctx, issue_flags);
        if (ret < 0)
                req_set_fail(req);
        io_req_set_res(req, ret, 0);
        return IOU_OK;
}

int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        unsigned long size, tmp_check;
        struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
        u64 tmp;

        if (sqe->rw_flags || sqe->splice_fd_in)
                return -EINVAL;

        tmp = READ_ONCE(sqe->fd);
        if (!tmp || tmp > MAX_BIDS_PER_BGID)
                return -E2BIG;
        p->nbufs = tmp;
        p->addr = READ_ONCE(sqe->addr);
        p->len = READ_ONCE(sqe->len);

        if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
                               &size))
                return -EOVERFLOW;
        if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
                return -EOVERFLOW;

        size = (unsigned long)p->len * p->nbufs;
        if (!access_ok(u64_to_user_ptr(p->addr), size))
                return -EFAULT;

        p->bgid = READ_ONCE(sqe->buf_group);
        tmp = READ_ONCE(sqe->off);
        if (tmp > USHRT_MAX)
                return -E2BIG;
        if (tmp + p->nbufs > MAX_BIDS_PER_BGID)
                return -EINVAL;
        p->bid = tmp;
        return 0;
}
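
/*
 * SQE field mapping for IORING_OP_PROVIDE_BUFFERS, as consumed by the prep
 * handler above (userspace view, for illustration only):
 *
 *        sqe->addr      = start of the buffer range
 *        sqe->len       = length of each buffer
 *        sqe->fd        = number of buffers (nbufs)
 *        sqe->buf_group = buffer group ID (bgid)
 *        sqe->off       = starting buffer ID (bid)
 *
 * liburing's io_uring_prep_provide_buffers() fills these in the same way.
 */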

#define IO_BUFFER_ALLOC_BATCH 64

static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
{
        struct io_buffer *bufs[IO_BUFFER_ALLOC_BATCH];
        int allocated;

        /*
         * Completions that don't happen inline (eg not under uring_lock) will
         * add to ->io_buffers_comp. If we don't have any free buffers, check
         * the completion list and splice those entries first.
         */
        if (!list_empty_careful(&ctx->io_buffers_comp)) {
                spin_lock(&ctx->completion_lock);
                if (!list_empty(&ctx->io_buffers_comp)) {
                        list_splice_init(&ctx->io_buffers_comp,
                                         &ctx->io_buffers_cache);
                        spin_unlock(&ctx->completion_lock);
                        return 0;
                }
                spin_unlock(&ctx->completion_lock);
        }

        /*
         * No free buffers and no completion entries either. Allocate a new
         * batch of buffer entries and add those to our freelist.
         */
        allocated = kmem_cache_alloc_bulk(io_buf_cachep, GFP_KERNEL_ACCOUNT,
                                          ARRAY_SIZE(bufs), (void **) bufs);
        if (unlikely(!allocated)) {
                /*
                 * Bulk alloc is all-or-nothing. If we fail to get a batch,
                 * retry single alloc to be on the safe side.
                 */
                bufs[0] = kmem_cache_alloc(io_buf_cachep, GFP_KERNEL);
                if (!bufs[0])
                        return -ENOMEM;
                allocated = 1;
        }

        while (allocated)
                list_add_tail(&bufs[--allocated]->list, &ctx->io_buffers_cache);

        return 0;
}

static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
                          struct io_buffer_list *bl)
{
        struct io_buffer *buf;
        u64 addr = pbuf->addr;
        int i, bid = pbuf->bid;

        for (i = 0; i < pbuf->nbufs; i++) {
                if (list_empty(&ctx->io_buffers_cache) &&
                    io_refill_buffer_cache(ctx))
                        break;
                buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
                                       list);
                list_move_tail(&buf->list, &bl->buf_list);
                buf->addr = addr;
                buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
                buf->bid = bid;
                buf->bgid = pbuf->bgid;
                addr += pbuf->len;
                bid++;
                cond_resched();
        }

        return i ? 0 : -ENOMEM;
}

int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
        struct io_ring_ctx *ctx = req->ctx;
        struct io_buffer_list *bl;
        int ret = 0;

        io_ring_submit_lock(ctx, issue_flags);

        if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) {
                ret = io_init_bl_list(ctx);
                if (ret)
                        goto err;
        }

        bl = io_buffer_get_list(ctx, p->bgid);
        if (unlikely(!bl)) {
                bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
                if (!bl) {
                        ret = -ENOMEM;
                        goto err;
                }
                INIT_LIST_HEAD(&bl->buf_list);
                ret = io_buffer_add_list(ctx, bl, p->bgid);
                if (ret) {
                        /*
                         * Doesn't need rcu free as it was never visible, but
                         * let's keep it consistent throughout. Also can't
                         * be a lower indexed array group, as adding one
                         * where lookup failed cannot happen.
                         */
                        if (p->bgid >= BGID_ARRAY)
                                kfree_rcu(bl, rcu);
                        else
                                WARN_ON_ONCE(1);
                        goto err;
                }
        }
        /* can't add buffers via this command for a mapped buffer ring */
        if (bl->is_mapped) {
                ret = -EINVAL;
                goto err;
        }

        ret = io_add_buffers(ctx, p, bl);
err:
        io_ring_submit_unlock(ctx, issue_flags);

        if (ret < 0)
                req_set_fail(req);
        io_req_set_res(req, ret, 0);
        return IOU_OK;
}

static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
                            struct io_buffer_list *bl)
{
        struct io_uring_buf_ring *br;
        struct page **pages;
        int i, nr_pages;

        pages = io_pin_pages(reg->ring_addr,
                             flex_array_size(br, bufs, reg->ring_entries),
                             &nr_pages);
        if (IS_ERR(pages))
                return PTR_ERR(pages);

        /*
         * Apparently some 32-bit boxes (ARM) will return highmem pages,
         * which then need to be mapped. We could support that, but it'd
         * complicate the code and slow down the common cases quite a bit.
         * So just error out, returning -EINVAL just like we did on kernels
         * that didn't support mapped buffer rings.
         */
        for (i = 0; i < nr_pages; i++)
                if (PageHighMem(pages[i]))
                        goto error_unpin;

        br = page_address(pages[0]);
#ifdef SHM_COLOUR
        /*
         * On platforms that have specific aliasing requirements, SHM_COLOUR
         * is set and we must guarantee that the kernel and user side align
         * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
         * the application mmap's the provided ring buffer. Fail the request
         * if we, by chance, don't end up with aligned addresses. The app
         * should use IOU_PBUF_RING_MMAP instead, and liburing will handle
         * this transparently.
         */
        if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1))
                goto error_unpin;
#endif
        bl->buf_pages = pages;
        bl->buf_nr_pages = nr_pages;
        bl->buf_ring = br;
        bl->is_mapped = 1;
        bl->is_mmap = 0;
        return 0;
error_unpin:
        for (i = 0; i < nr_pages; i++)
                unpin_user_page(pages[i]);
        kvfree(pages);
        return -EINVAL;
}
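
/*
 * Size note for the pin above (illustrative, not part of the original file):
 * flex_array_size(br, bufs, reg->ring_entries) is reg->ring_entries *
 * sizeof(struct io_uring_buf), i.e. 16 bytes per entry. For example, assuming
 * 4K pages, a ring registered with 4096 entries pins 64KiB, or 16 pages.
 */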

/*
 * See if we have a suitable region that we can reuse, rather than allocate
 * both a new io_buf_free and mem region again. We leave it on the list as
 * even a reused entry will need freeing at ring release.
 */
static struct io_buf_free *io_lookup_buf_free_entry(struct io_ring_ctx *ctx,
                                                    size_t ring_size)
{
        struct io_buf_free *ibf, *best = NULL;
        size_t best_dist;

        hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
                size_t dist;

                if (ibf->inuse || ibf->size < ring_size)
                        continue;
                dist = ibf->size - ring_size;
                if (!best || dist < best_dist) {
                        best = ibf;
                        if (!dist)
                                break;
                        best_dist = dist;
                }
        }

        return best;
}

static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
                              struct io_uring_buf_reg *reg,
                              struct io_buffer_list *bl)
{
        struct io_buf_free *ibf;
        size_t ring_size;
        void *ptr;

        ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);

        /* Reuse existing entry, if we can */
        ibf = io_lookup_buf_free_entry(ctx, ring_size);
        if (!ibf) {
                ptr = io_mem_alloc(ring_size);
                if (!ptr)
                        return -ENOMEM;

                /* Allocate and store deferred free entry */
                ibf = kmalloc(sizeof(*ibf), GFP_KERNEL_ACCOUNT);
                if (!ibf) {
                        io_mem_free(ptr);
                        return -ENOMEM;
                }
                ibf->mem = ptr;
                ibf->size = ring_size;
                hlist_add_head(&ibf->list, &ctx->io_buf_list);
        }
        ibf->inuse = 1;
        bl->buf_ring = ibf->mem;
        bl->is_mapped = 1;
        bl->is_mmap = 1;
        return 0;
}

int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
        struct io_uring_buf_reg reg;
        struct io_buffer_list *bl, *free_bl = NULL;
        int ret;

        lockdep_assert_held(&ctx->uring_lock);

        if (copy_from_user(&reg, arg, sizeof(reg)))
                return -EFAULT;

        if (reg.resv[0] || reg.resv[1] || reg.resv[2])
                return -EINVAL;
        if (reg.flags & ~IOU_PBUF_RING_MMAP)
                return -EINVAL;
        if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
                if (!reg.ring_addr)
                        return -EFAULT;
                if (reg.ring_addr & ~PAGE_MASK)
                        return -EINVAL;
        } else {
                if (reg.ring_addr)
                        return -EINVAL;
        }

        if (!is_power_of_2(reg.ring_entries))
                return -EINVAL;

        /* cannot disambiguate full vs empty due to head/tail size */
        if (reg.ring_entries >= 65536)
                return -EINVAL;

        if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
                int ret = io_init_bl_list(ctx);
                if (ret)
                        return ret;
        }

        bl = io_buffer_get_list(ctx, reg.bgid);
        if (bl) {
                /* if mapped buffer ring OR classic exists, don't allow */
                if (bl->is_mapped || !list_empty(&bl->buf_list))
                        return -EEXIST;
        } else {
                free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
                if (!bl)
                        return -ENOMEM;
        }

        if (!(reg.flags & IOU_PBUF_RING_MMAP))
                ret = io_pin_pbuf_ring(&reg, bl);
        else
                ret = io_alloc_pbuf_ring(ctx, &reg, bl);

        if (!ret) {
                bl->nr_entries = reg.ring_entries;
                bl->mask = reg.ring_entries - 1;

                io_buffer_add_list(ctx, bl, reg.bgid);
                return 0;
        }

        kfree_rcu(free_bl, rcu);
        return ret;
}
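
/*
 * Registration from userspace, for reference (an illustrative sketch, not
 * part of this file): the application fills a struct io_uring_buf_reg and
 * passes it to io_uring_register(2) with IORING_REGISTER_PBUF_RING, e.g.
 *
 *        struct io_uring_buf_reg reg = {
 *                .ring_addr    = (unsigned long) ring_mem, // 0 with IOU_PBUF_RING_MMAP
 *                .ring_entries = 4096,                     // power of 2, < 65536
 *                .bgid         = bgid,
 *        };
 *
 * liburing offers io_uring_register_buf_ring() as a wrapper around this.
 */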

int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
        struct io_uring_buf_reg reg;
        struct io_buffer_list *bl;

        lockdep_assert_held(&ctx->uring_lock);

        if (copy_from_user(&reg, arg, sizeof(reg)))
                return -EFAULT;
        if (reg.resv[0] || reg.resv[1] || reg.resv[2])
                return -EINVAL;
        if (reg.flags)
                return -EINVAL;

        bl = io_buffer_get_list(ctx, reg.bgid);
        if (!bl)
                return -ENOENT;
        if (!bl->is_mapped)
                return -EINVAL;

        __io_remove_buffers(ctx, bl, -1U);
        if (bl->bgid >= BGID_ARRAY) {
                xa_erase(&ctx->io_bl_xa, bl->bgid);
                kfree_rcu(bl, rcu);
        }
        return 0;
}

void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
{
        struct io_buffer_list *bl;

        bl = __io_buffer_get_list(ctx, smp_load_acquire(&ctx->io_bl), bgid);

        if (!bl || !bl->is_mmap)
                return NULL;
        /*
         * Ensure the list is fully setup. Only strictly needed for RCU lookup
         * via mmap, and in that case only for the array indexed groups. For
         * the xarray lookups, it's either visible and ready, or not at all.
         */
        if (!smp_load_acquire(&bl->is_ready))
                return NULL;

        return bl->buf_ring;
}

/*
 * Called at or after ->release(), free the mmap'ed buffers that we used
 * for memory mapped provided buffer rings.
 */
void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx)
{
        struct io_buf_free *ibf;
        struct hlist_node *tmp;

        hlist_for_each_entry_safe(ibf, tmp, &ctx->io_buf_list, list) {
                hlist_del(&ibf->list);
                io_mem_free(ibf->mem);
                kfree(ibf);
        }
}