// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "opdef.h"
#include "kbuf.h"

#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))

#define BGID_ARRAY	64

/* BIDs are addressed by a 16-bit field in a CQE */
#define MAX_BIDS_PER_BGID (1 << 16)

struct kmem_cache *io_buf_cachep;

struct io_provide_buf {
	struct file			*file;
	__u64				addr;
	__u32				len;
	__u32				bgid;
	__u32				nbufs;
	__u16				bid;
};

static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
							unsigned int bgid)
{
	if (ctx->io_bl && bgid < BGID_ARRAY)
		return &ctx->io_bl[bgid];

	return xa_load(&ctx->io_bl_xa, bgid);
}

static int io_buffer_add_list(struct io_ring_ctx *ctx,
			      struct io_buffer_list *bl, unsigned int bgid)
{
	bl->bgid = bgid;
	if (bgid < BGID_ARRAY)
		return 0;

	return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
}

bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	struct io_buffer *buf;

	/*
	 * For legacy provided buffer mode, don't recycle if we already did
	 * IO to this buffer. For ring-mapped provided buffer mode, we should
	 * increment ring->head to explicitly monopolize the buffer to avoid
	 * multiple use.
	 */
	if (req->flags & REQ_F_PARTIAL_IO)
		return false;

	io_ring_submit_lock(ctx, issue_flags);

	buf = req->kbuf;
	bl = io_buffer_get_list(ctx, buf->bgid);
	list_add(&buf->list, &bl->buf_list);
	req->flags &= ~REQ_F_BUFFER_SELECTED;
	req->buf_index = buf->bgid;

	io_ring_submit_unlock(ctx, issue_flags);
	return true;
}

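/*
 * Release a previously selected buffer back to its buffer list and return
 * the completion flags (cflags) that report the used buffer to userspace.
 * Which list the buffer goes back to depends on whether uring_lock is
 * held; ring mapped buffers have nothing to put back here.
 */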
unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
{
	unsigned int cflags;

	/*
	 * We can add this buffer back to two lists:
	 *
	 * 1) The io_buffers_cache list. This one is protected by the
	 *    ctx->uring_lock. If we already hold this lock, add back to this
	 *    list as we can grab it from issue as well.
	 * 2) The io_buffers_comp list. This one is protected by the
	 *    ctx->completion_lock.
	 *
	 * We migrate buffers from the comp_list to the issue cache list
	 * when we need one.
	 */
	if (req->flags & REQ_F_BUFFER_RING) {
		/* no buffers to recycle for this case */
		cflags = __io_put_kbuf_list(req, NULL);
	} else if (issue_flags & IO_URING_F_UNLOCKED) {
		struct io_ring_ctx *ctx = req->ctx;

		spin_lock(&ctx->completion_lock);
		cflags = __io_put_kbuf_list(req, &ctx->io_buffers_comp);
		spin_unlock(&ctx->completion_lock);
	} else {
		lockdep_assert_held(&req->ctx->uring_lock);

		cflags = __io_put_kbuf_list(req, &req->ctx->io_buffers_cache);
	}
	return cflags;
}

static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
					      struct io_buffer_list *bl)
{
	if (!list_empty(&bl->buf_list)) {
		struct io_buffer *kbuf;

		kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
		list_del(&kbuf->list);
		if (*len == 0 || *len > kbuf->len)
			*len = kbuf->len;
		req->flags |= REQ_F_BUFFER_SELECTED;
		req->kbuf = kbuf;
		req->buf_index = kbuf->bid;
		return u64_to_user_ptr(kbuf->addr);
	}
	return NULL;
}

static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
					  struct io_buffer_list *bl,
					  unsigned int issue_flags)
{
	struct io_uring_buf_ring *br = bl->buf_ring;
	struct io_uring_buf *buf;
	__u16 head = bl->head;

	if (unlikely(smp_load_acquire(&br->tail) == head))
		return NULL;

	head &= bl->mask;
	/* mmaped buffers are always contig */
	if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) {
		buf = &br->bufs[head];
	} else {
		int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
		int index = head / IO_BUFFER_LIST_BUF_PER_PAGE;
		buf = page_address(bl->buf_pages[index]);
		buf += off;
	}
	if (*len == 0 || *len > buf->len)
		*len = buf->len;
	req->flags |= REQ_F_BUFFER_RING;
	req->buf_list = bl;
	req->buf_index = buf->bid;

	if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) {
		/*
		 * If we came in unlocked, we have no choice but to consume the
		 * buffer here, otherwise nothing ensures that the buffer won't
		 * get used by others. This does mean it'll be pinned until the
		 * IO completes. Coming in unlocked means we're being called from
		 * io-wq context and there may be further retries in async hybrid
		 * mode. For the locked case, the caller must call commit when
		 * the transfer completes (or if we get -EAGAIN and must poll or
		 * retry).
		 */
		req->buf_list = NULL;
		bl->head++;
	}
	return u64_to_user_ptr(buf->addr);
}

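/*
 * Select a buffer for a request from the buffer group identified by
 * req->buf_index. Ring mapped groups use io_ring_buffer_select(), legacy
 * provided buffer groups use io_provided_buffer_select(). Returns the
 * user address of the selected buffer, or NULL if the group is empty or
 * doesn't exist.
 */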
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
			      unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	void __user *ret = NULL;

	io_ring_submit_lock(req->ctx, issue_flags);

	bl = io_buffer_get_list(ctx, req->buf_index);
	if (likely(bl)) {
		if (bl->is_mapped)
			ret = io_ring_buffer_select(req, len, bl, issue_flags);
		else
			ret = io_provided_buffer_select(req, len, bl);
	}
	io_ring_submit_unlock(req->ctx, issue_flags);
	return ret;
}

static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
{
	int i;

	ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list),
			     GFP_KERNEL);
	if (!ctx->io_bl)
		return -ENOMEM;

	for (i = 0; i < BGID_ARRAY; i++) {
		INIT_LIST_HEAD(&ctx->io_bl[i].buf_list);
		ctx->io_bl[i].bgid = i;
	}

	return 0;
}

static int __io_remove_buffers(struct io_ring_ctx *ctx,
			       struct io_buffer_list *bl, unsigned nbufs)
{
	unsigned i = 0;

	/* shouldn't happen */
	if (!nbufs)
		return 0;

	if (bl->is_mapped) {
		i = bl->buf_ring->tail - bl->head;
		if (bl->is_mmap) {
			folio_put(virt_to_folio(bl->buf_ring));
			bl->buf_ring = NULL;
			bl->is_mmap = 0;
		} else if (bl->buf_nr_pages) {
			int j;

			for (j = 0; j < bl->buf_nr_pages; j++)
				unpin_user_page(bl->buf_pages[j]);
			kvfree(bl->buf_pages);
			bl->buf_pages = NULL;
			bl->buf_nr_pages = 0;
		}
		/* make sure it's seen as empty */
		INIT_LIST_HEAD(&bl->buf_list);
		bl->is_mapped = 0;
		return i;
	}

	/* protects io_buffers_cache */
	lockdep_assert_held(&ctx->uring_lock);

	while (!list_empty(&bl->buf_list)) {
		struct io_buffer *nxt;

		nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
		list_move(&nxt->list, &ctx->io_buffers_cache);
		if (++i == nbufs)
			return i;
		cond_resched();
	}

	return i;
}

void io_destroy_buffers(struct io_ring_ctx *ctx)
{
	struct io_buffer_list *bl;
	struct list_head *item, *tmp;
	struct io_buffer *buf;
	unsigned long index;
	int i;

	for (i = 0; i < BGID_ARRAY; i++) {
		if (!ctx->io_bl)
			break;
		__io_remove_buffers(ctx, &ctx->io_bl[i], -1U);
	}

	xa_for_each(&ctx->io_bl_xa, index, bl) {
		xa_erase(&ctx->io_bl_xa, bl->bgid);
		__io_remove_buffers(ctx, bl, -1U);
		kfree(bl);
	}

	list_for_each_safe(item, tmp, &ctx->io_buffers_cache) {
		buf = list_entry(item, struct io_buffer, list);
		kmem_cache_free(io_buf_cachep, buf);
	}
}

int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	u64 tmp;

	if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
	    sqe->splice_fd_in)
		return -EINVAL;

	tmp = READ_ONCE(sqe->fd);
	if (!tmp || tmp > MAX_BIDS_PER_BGID)
		return -EINVAL;

	memset(p, 0, sizeof(*p));
	p->nbufs = tmp;
	p->bgid = READ_ONCE(sqe->buf_group);
	return 0;
}

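/*
 * IORING_OP_REMOVE_BUFFERS: remove up to p->nbufs buffers from a legacy
 * provided buffer group. The CQE res is the number of buffers removed,
 * -ENOENT if the group doesn't exist, or -EINVAL if the group is a mapped
 * buffer ring.
 */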
int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);

	ret = -ENOENT;
	bl = io_buffer_get_list(ctx, p->bgid);
	if (bl) {
		ret = -EINVAL;
		/* can't use provide/remove buffers command on mapped buffers */
		if (!bl->is_mapped)
			ret = __io_remove_buffers(ctx, bl, p->nbufs);
	}
	io_ring_submit_unlock(ctx, issue_flags);
	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	unsigned long size, tmp_check;
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	u64 tmp;

	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	tmp = READ_ONCE(sqe->fd);
	if (!tmp || tmp > MAX_BIDS_PER_BGID)
		return -E2BIG;
	p->nbufs = tmp;
	p->addr = READ_ONCE(sqe->addr);
	p->len = READ_ONCE(sqe->len);

	if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
			       &size))
		return -EOVERFLOW;
	if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
		return -EOVERFLOW;

	size = (unsigned long)p->len * p->nbufs;
	if (!access_ok(u64_to_user_ptr(p->addr), size))
		return -EFAULT;

	p->bgid = READ_ONCE(sqe->buf_group);
	tmp = READ_ONCE(sqe->off);
	if (tmp > USHRT_MAX)
		return -E2BIG;
	if (tmp + p->nbufs > MAX_BIDS_PER_BGID)
		return -EINVAL;
	p->bid = tmp;
	return 0;
}

#define IO_BUFFER_ALLOC_BATCH 64

static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
{
	struct io_buffer *bufs[IO_BUFFER_ALLOC_BATCH];
	int allocated;

	/*
	 * Completions that don't happen inline (eg not under uring_lock) will
	 * add to ->io_buffers_comp. If we don't have any free buffers, check
	 * the completion list and splice those entries first.
	 */
	if (!list_empty_careful(&ctx->io_buffers_comp)) {
		spin_lock(&ctx->completion_lock);
		if (!list_empty(&ctx->io_buffers_comp)) {
			list_splice_init(&ctx->io_buffers_comp,
					 &ctx->io_buffers_cache);
			spin_unlock(&ctx->completion_lock);
			return 0;
		}
		spin_unlock(&ctx->completion_lock);
	}

	/*
	 * No free buffers and no completion entries either. Allocate a new
	 * batch of buffer entries and add those to our freelist.
	 */

	allocated = kmem_cache_alloc_bulk(io_buf_cachep, GFP_KERNEL_ACCOUNT,
					  ARRAY_SIZE(bufs), (void **) bufs);
	if (unlikely(!allocated)) {
		/*
		 * Bulk alloc is all-or-nothing. If we fail to get a batch,
		 * retry single alloc to be on the safe side.
		 */
		bufs[0] = kmem_cache_alloc(io_buf_cachep, GFP_KERNEL);
		if (!bufs[0])
			return -ENOMEM;
		allocated = 1;
	}

	while (allocated)
		list_add_tail(&bufs[--allocated]->list, &ctx->io_buffers_cache);

	return 0;
}

static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
			  struct io_buffer_list *bl)
{
	struct io_buffer *buf;
	u64 addr = pbuf->addr;
	int i, bid = pbuf->bid;

	for (i = 0; i < pbuf->nbufs; i++) {
		if (list_empty(&ctx->io_buffers_cache) &&
		    io_refill_buffer_cache(ctx))
			break;
		buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
				       list);
		list_move_tail(&buf->list, &bl->buf_list);
		buf->addr = addr;
		buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
		buf->bid = bid;
		buf->bgid = pbuf->bgid;
		addr += pbuf->len;
		bid++;
		cond_resched();
	}

	return i ? 0 : -ENOMEM;
}

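/*
 * IORING_OP_PROVIDE_BUFFERS: add p->nbufs buffers of p->len bytes each,
 * starting at p->addr and buffer ID p->bid, to legacy buffer group
 * p->bgid. The group is created on first use; groups registered as mapped
 * buffer rings are rejected with -EINVAL.
 */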
int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);

	if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) {
		ret = io_init_bl_list(ctx);
		if (ret)
			goto err;
	}

	bl = io_buffer_get_list(ctx, p->bgid);
	if (unlikely(!bl)) {
		bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
		if (!bl) {
			ret = -ENOMEM;
			goto err;
		}
		INIT_LIST_HEAD(&bl->buf_list);
		ret = io_buffer_add_list(ctx, bl, p->bgid);
		if (ret) {
			kfree(bl);
			goto err;
		}
	}
	/* can't add buffers via this command for a mapped buffer ring */
	if (bl->is_mapped) {
		ret = -EINVAL;
		goto err;
	}

	ret = io_add_buffers(ctx, p, bl);
err:
	io_ring_submit_unlock(ctx, issue_flags);

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
			    struct io_buffer_list *bl)
{
	struct io_uring_buf_ring *br;
	struct page **pages;
	int i, nr_pages;

	pages = io_pin_pages(reg->ring_addr,
			     flex_array_size(br, bufs, reg->ring_entries),
			     &nr_pages);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/*
	 * Apparently some 32-bit boxes (ARM) will return highmem pages,
	 * which then need to be mapped. We could support that, but it'd
	 * complicate the code and slow down the common cases quite a bit.
	 * So just error out, returning -EINVAL just like we did on kernels
	 * that didn't support mapped buffer rings.
	 */
	for (i = 0; i < nr_pages; i++)
		if (PageHighMem(pages[i]))
			goto error_unpin;

	br = page_address(pages[0]);
#ifdef SHM_COLOUR
	/*
	 * On platforms that have specific aliasing requirements, SHM_COLOUR
	 * is set and we must guarantee that the kernel and user side align
	 * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
	 * the application mmaps the provided ring buffer. Fail the request
	 * if we, by chance, don't end up with aligned addresses. The app
	 * should use IOU_PBUF_RING_MMAP instead, and liburing will handle
	 * this transparently.
	 */
	if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1))
		goto error_unpin;
#endif
	bl->buf_pages = pages;
	bl->buf_nr_pages = nr_pages;
	bl->buf_ring = br;
	bl->is_mapped = 1;
	bl->is_mmap = 0;
	return 0;
error_unpin:
	for (i = 0; i < nr_pages; i++)
		unpin_user_page(pages[i]);
	kvfree(pages);
	return -EINVAL;
}

static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg,
			      struct io_buffer_list *bl)
{
	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
	size_t ring_size;
	void *ptr;

	ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
	ptr = (void *) __get_free_pages(gfp, get_order(ring_size));
	if (!ptr)
		return -ENOMEM;

	bl->buf_ring = ptr;
	bl->is_mapped = 1;
	bl->is_mmap = 1;
	return 0;
}

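/*
 * Register a provided buffer ring (IORING_REGISTER_PBUF_RING). The ring
 * memory is either supplied by the application and pinned here, or, if
 * IOU_PBUF_RING_MMAP is set, allocated by the kernel and later mmap'ed by
 * the application. ring_entries must be a power of two below 65536.
 */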
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_buf_reg reg;
	struct io_buffer_list *bl, *free_bl = NULL;
	int ret;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;

	if (reg.resv[0] || reg.resv[1] || reg.resv[2])
		return -EINVAL;
	if (reg.flags & ~IOU_PBUF_RING_MMAP)
		return -EINVAL;
	if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
		if (!reg.ring_addr)
			return -EFAULT;
		if (reg.ring_addr & ~PAGE_MASK)
			return -EINVAL;
	} else {
		if (reg.ring_addr)
			return -EINVAL;
	}

	if (!is_power_of_2(reg.ring_entries))
		return -EINVAL;

	/* cannot disambiguate full vs empty due to head/tail size */
	if (reg.ring_entries >= 65536)
		return -EINVAL;

	if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
		int ret = io_init_bl_list(ctx);
		if (ret)
			return ret;
	}

	bl = io_buffer_get_list(ctx, reg.bgid);
	if (bl) {
		/* if mapped buffer ring OR classic exists, don't allow */
		if (bl->is_mapped || !list_empty(&bl->buf_list))
			return -EEXIST;
	} else {
		free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
		if (!bl)
			return -ENOMEM;
	}

	if (!(reg.flags & IOU_PBUF_RING_MMAP))
		ret = io_pin_pbuf_ring(&reg, bl);
	else
		ret = io_alloc_pbuf_ring(&reg, bl);

	if (!ret) {
		bl->nr_entries = reg.ring_entries;
		bl->mask = reg.ring_entries - 1;

		io_buffer_add_list(ctx, bl, reg.bgid);
		return 0;
	}

	kfree(free_bl);
	return ret;
}

int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_buf_reg reg;
	struct io_buffer_list *bl;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (reg.resv[0] || reg.resv[1] || reg.resv[2])
		return -EINVAL;
	if (reg.flags)
		return -EINVAL;

	bl = io_buffer_get_list(ctx, reg.bgid);
	if (!bl)
		return -ENOENT;
	if (!bl->is_mapped)
		return -EINVAL;

	__io_remove_buffers(ctx, bl, -1U);
	if (bl->bgid >= BGID_ARRAY) {
		xa_erase(&ctx->io_bl_xa, bl->bgid);
		kfree(bl);
	}
	return 0;
}

void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
{
	struct io_buffer_list *bl;

	bl = io_buffer_get_list(ctx, bgid);
	if (!bl || !bl->is_mmap)
		return NULL;

	return bl->buf_ring;
}