// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "opdef.h"
#include "kbuf.h"

#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))

#define BGID_ARRAY	64

struct io_provide_buf {
	struct file			*file;
	__u64				addr;
	__u32				len;
	__u32				bgid;
	__u16				nbufs;
	__u16				bid;
};

static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
							unsigned int bgid)
{
	if (ctx->io_bl && bgid < BGID_ARRAY)
		return &ctx->io_bl[bgid];

	return xa_load(&ctx->io_bl_xa, bgid);
}

static int io_buffer_add_list(struct io_ring_ctx *ctx,
			      struct io_buffer_list *bl, unsigned int bgid)
{
	bl->bgid = bgid;
	if (bgid < BGID_ARRAY)
		return 0;

	return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
}

void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	struct io_buffer *buf;

	/*
	 * For legacy provided buffer mode, don't recycle if we already did
	 * IO to this buffer. For ring-mapped provided buffer mode, we should
	 * increment ring->head to explicitly monopolize the buffer to avoid
	 * multiple use.
	 */
	if (req->flags & REQ_F_PARTIAL_IO)
		return;

	io_ring_submit_lock(ctx, issue_flags);

	buf = req->kbuf;
	bl = io_buffer_get_list(ctx, buf->bgid);
	list_add(&buf->list, &bl->buf_list);
	req->flags &= ~REQ_F_BUFFER_SELECTED;
	req->buf_index = buf->bgid;

	io_ring_submit_unlock(ctx, issue_flags);
}
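/*
 * Illustrative note (not derived from this file alone): once a provided
 * buffer has been consumed, the completion tells userspace which buffer
 * was used by encoding the buffer ID into cqe->flags. A consumer would
 * typically do something like the sketch below; IORING_CQE_F_BUFFER and
 * IORING_CQE_BUFFER_SHIFT are the UAPI constants from <linux/io_uring.h>,
 * and the cflags value is what __io_put_kbuf() below ends up returning.
 *
 *	if (cqe->flags & IORING_CQE_F_BUFFER) {
 *		unsigned int bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;
 *
 *		// data for this request landed in buffer 'bid' of the
 *		// buffer group the request selected from
 *	}
 */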
unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
{
	unsigned int cflags;

	/*
	 * We can add this buffer back to two lists:
	 *
	 * 1) The io_buffers_cache list. This one is protected by the
	 *    ctx->uring_lock. If we already hold this lock, add back to this
	 *    list as we can grab it from issue as well.
	 * 2) The io_buffers_comp list. This one is protected by the
	 *    ctx->completion_lock.
	 *
	 * We migrate buffers from the comp_list to the issue cache list
	 * when we need one.
	 */
	if (req->flags & REQ_F_BUFFER_RING) {
		/* no buffers to recycle for this case */
		cflags = __io_put_kbuf_list(req, NULL);
	} else if (issue_flags & IO_URING_F_UNLOCKED) {
		struct io_ring_ctx *ctx = req->ctx;

		spin_lock(&ctx->completion_lock);
		cflags = __io_put_kbuf_list(req, &ctx->io_buffers_comp);
		spin_unlock(&ctx->completion_lock);
	} else {
		lockdep_assert_held(&req->ctx->uring_lock);

		cflags = __io_put_kbuf_list(req, &req->ctx->io_buffers_cache);
	}
	return cflags;
}

static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
					      struct io_buffer_list *bl)
{
	if (!list_empty(&bl->buf_list)) {
		struct io_buffer *kbuf;

		kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
		list_del(&kbuf->list);
		if (*len == 0 || *len > kbuf->len)
			*len = kbuf->len;
		req->flags |= REQ_F_BUFFER_SELECTED;
		req->kbuf = kbuf;
		req->buf_index = kbuf->bid;
		return u64_to_user_ptr(kbuf->addr);
	}
	return NULL;
}

static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
					  struct io_buffer_list *bl,
					  unsigned int issue_flags)
{
	struct io_uring_buf_ring *br = bl->buf_ring;
	struct io_uring_buf *buf;
	__u16 head = bl->head;

	if (unlikely(smp_load_acquire(&br->tail) == head))
		return NULL;

	head &= bl->mask;
	/* mmapped buffers are always contiguous */
	if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) {
		buf = &br->bufs[head];
	} else {
		int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
		int index = head / IO_BUFFER_LIST_BUF_PER_PAGE;

		buf = page_address(bl->buf_pages[index]);
		buf += off;
	}
	if (*len == 0 || *len > buf->len)
		*len = buf->len;
	req->flags |= REQ_F_BUFFER_RING;
	req->buf_list = bl;
	req->buf_index = buf->bid;

	if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) {
		/*
		 * If we came in unlocked, we have no choice but to consume the
		 * buffer here, otherwise nothing ensures that the buffer won't
		 * get used by others. This does mean it'll be pinned until the
		 * IO completes, coming in unlocked means we're being called from
		 * io-wq context and there may be further retries in async hybrid
		 * mode. For the locked case, the caller must call commit when
		 * the transfer completes (or if we get -EAGAIN and must poll or
		 * retry).
		 */
		req->buf_list = NULL;
		bl->head++;
	}
	return u64_to_user_ptr(buf->addr);
}
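/*
 * Illustrative userspace counterpart (not part of the kernel build): the
 * application owns the tail of the buffer ring, while the kernel owns the
 * head consumed in io_ring_buffer_select() above. With liburing, refilling
 * a ring of 'nbufs' entries looks roughly like:
 *
 *	// add the buffers, then publish them with a single tail update
 *	for (i = 0; i < nbufs; i++)
 *		io_uring_buf_ring_add(br, bufs[i], buf_len, i,
 *				      io_uring_buf_ring_mask(nbufs), i);
 *	io_uring_buf_ring_advance(br, nbufs);
 *
 * io_uring_buf_ring_advance() performs the store-release on br->tail that
 * pairs with the smp_load_acquire() above.
 */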
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
			      unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	void __user *ret = NULL;

	io_ring_submit_lock(req->ctx, issue_flags);

	bl = io_buffer_get_list(ctx, req->buf_index);
	if (likely(bl)) {
		if (bl->is_mapped)
			ret = io_ring_buffer_select(req, len, bl, issue_flags);
		else
			ret = io_provided_buffer_select(req, len, bl);
	}
	io_ring_submit_unlock(req->ctx, issue_flags);
	return ret;
}

static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
{
	int i;

	ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list),
			     GFP_KERNEL);
	if (!ctx->io_bl)
		return -ENOMEM;

	for (i = 0; i < BGID_ARRAY; i++) {
		INIT_LIST_HEAD(&ctx->io_bl[i].buf_list);
		ctx->io_bl[i].bgid = i;
	}

	return 0;
}

static int __io_remove_buffers(struct io_ring_ctx *ctx,
			       struct io_buffer_list *bl, unsigned nbufs)
{
	unsigned i = 0;

	/* shouldn't happen */
	if (!nbufs)
		return 0;

	if (bl->is_mapped) {
		i = bl->buf_ring->tail - bl->head;
		if (bl->is_mmap) {
			folio_put(virt_to_folio(bl->buf_ring));
			bl->buf_ring = NULL;
			bl->is_mmap = 0;
		} else if (bl->buf_nr_pages) {
			int j;

			for (j = 0; j < bl->buf_nr_pages; j++)
				unpin_user_page(bl->buf_pages[j]);
			kvfree(bl->buf_pages);
			bl->buf_pages = NULL;
			bl->buf_nr_pages = 0;
		}
		/* make sure it's seen as empty */
		INIT_LIST_HEAD(&bl->buf_list);
		bl->is_mapped = 0;
		return i;
	}

	/* protects io_buffers_cache */
	lockdep_assert_held(&ctx->uring_lock);

	while (!list_empty(&bl->buf_list)) {
		struct io_buffer *nxt;

		nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
		list_move(&nxt->list, &ctx->io_buffers_cache);
		if (++i == nbufs)
			return i;
		cond_resched();
	}

	return i;
}

void io_destroy_buffers(struct io_ring_ctx *ctx)
{
	struct io_buffer_list *bl;
	unsigned long index;
	int i;

	for (i = 0; i < BGID_ARRAY; i++) {
		if (!ctx->io_bl)
			break;
		__io_remove_buffers(ctx, &ctx->io_bl[i], -1U);
	}

	xa_for_each(&ctx->io_bl_xa, index, bl) {
		xa_erase(&ctx->io_bl_xa, bl->bgid);
		__io_remove_buffers(ctx, bl, -1U);
		kfree(bl);
	}

	while (!list_empty(&ctx->io_buffers_pages)) {
		struct page *page;

		page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
		list_del_init(&page->lru);
		__free_page(page);
	}
}

int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	u64 tmp;

	if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
	    sqe->splice_fd_in)
		return -EINVAL;

	tmp = READ_ONCE(sqe->fd);
	if (!tmp || tmp > USHRT_MAX)
		return -EINVAL;

	memset(p, 0, sizeof(*p));
	p->nbufs = tmp;
	p->bgid = READ_ONCE(sqe->buf_group);
	return 0;
}
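/*
 * Illustrative userspace counterpart (not part of the kernel build): the
 * prep helper above expects the buffer count in sqe->fd and the group ID
 * in sqe->buf_group, which is what liburing fills in:
 *
 *	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
 *
 *	io_uring_prep_remove_buffers(sqe, nbufs, bgid);
 *	io_uring_submit(&ring);
 *
 * The CQE result is the number of buffers removed, or a negative errno.
 */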
int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);

	ret = -ENOENT;
	bl = io_buffer_get_list(ctx, p->bgid);
	if (bl) {
		ret = -EINVAL;
		/* can't use provide/remove buffers command on mapped buffers */
		if (!bl->is_mapped)
			ret = __io_remove_buffers(ctx, bl, p->nbufs);
	}
	io_ring_submit_unlock(ctx, issue_flags);
	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	unsigned long size, tmp_check;
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	u64 tmp;

	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	tmp = READ_ONCE(sqe->fd);
	if (!tmp || tmp > USHRT_MAX)
		return -E2BIG;
	p->nbufs = tmp;
	p->addr = READ_ONCE(sqe->addr);
	p->len = READ_ONCE(sqe->len);

	if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
			       &size))
		return -EOVERFLOW;
	if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
		return -EOVERFLOW;

	size = (unsigned long)p->len * p->nbufs;
	if (!access_ok(u64_to_user_ptr(p->addr), size))
		return -EFAULT;

	p->bgid = READ_ONCE(sqe->buf_group);
	tmp = READ_ONCE(sqe->off);
	if (tmp > USHRT_MAX)
		return -E2BIG;
	if (tmp + p->nbufs >= USHRT_MAX)
		return -EINVAL;
	p->bid = tmp;
	return 0;
}

static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
{
	struct io_buffer *buf;
	struct page *page;
	int bufs_in_page;

	/*
	 * Completions that don't happen inline (eg not under uring_lock) will
	 * add to ->io_buffers_comp. If we don't have any free buffers, check
	 * the completion list and splice those entries first.
	 */
	if (!list_empty_careful(&ctx->io_buffers_comp)) {
		spin_lock(&ctx->completion_lock);
		if (!list_empty(&ctx->io_buffers_comp)) {
			list_splice_init(&ctx->io_buffers_comp,
					 &ctx->io_buffers_cache);
			spin_unlock(&ctx->completion_lock);
			return 0;
		}
		spin_unlock(&ctx->completion_lock);
	}

	/*
	 * No free buffers and no completion entries either. Allocate a new
	 * page worth of buffer entries and add those to our freelist.
	 */
	page = alloc_page(GFP_KERNEL_ACCOUNT);
	if (!page)
		return -ENOMEM;

	list_add(&page->lru, &ctx->io_buffers_pages);

	buf = page_address(page);
	bufs_in_page = PAGE_SIZE / sizeof(*buf);
	while (bufs_in_page) {
		list_add_tail(&buf->list, &ctx->io_buffers_cache);
		buf++;
		bufs_in_page--;
	}

	return 0;
}

static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
			  struct io_buffer_list *bl)
{
	struct io_buffer *buf;
	u64 addr = pbuf->addr;
	int i, bid = pbuf->bid;

	for (i = 0; i < pbuf->nbufs; i++) {
		if (list_empty(&ctx->io_buffers_cache) &&
		    io_refill_buffer_cache(ctx))
			break;
		buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
				       list);
		list_move_tail(&buf->list, &bl->buf_list);
		buf->addr = addr;
		buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
		buf->bid = bid;
		buf->bgid = pbuf->bgid;
		addr += pbuf->len;
		bid++;
		cond_resched();
	}

	return i ? 0 : -ENOMEM;
}
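/*
 * Illustrative userspace counterpart (not part of the kernel build): the
 * prep helper above reads nbufs from sqe->fd, the base address from
 * sqe->addr, the per-buffer length from sqe->len, the group from
 * sqe->buf_group and the starting buffer ID from sqe->off. With liburing
 * this looks roughly like:
 *
 *	char *base = malloc((size_t)nbufs * buf_len);
 *	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
 *
 *	io_uring_prep_provide_buffers(sqe, base, buf_len, nbufs, bgid, 0);
 *	io_uring_submit(&ring);
 *
 * Buffer IDs 0..nbufs-1 then become selectable by any request on this ring
 * that sets IOSQE_BUFFER_SELECT with sqe->buf_group == bgid.
 */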
int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);

	if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) {
		ret = io_init_bl_list(ctx);
		if (ret)
			goto err;
	}

	bl = io_buffer_get_list(ctx, p->bgid);
	if (unlikely(!bl)) {
		bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
		if (!bl) {
			ret = -ENOMEM;
			goto err;
		}
		INIT_LIST_HEAD(&bl->buf_list);
		ret = io_buffer_add_list(ctx, bl, p->bgid);
		if (ret) {
			kfree(bl);
			goto err;
		}
	}
	/* can't add buffers via this command for a mapped buffer ring */
	if (bl->is_mapped) {
		ret = -EINVAL;
		goto err;
	}

	ret = io_add_buffers(ctx, p, bl);
err:
	io_ring_submit_unlock(ctx, issue_flags);

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
			    struct io_buffer_list *bl)
{
	struct io_uring_buf_ring *br;
	struct page **pages;
	int i, nr_pages;

	pages = io_pin_pages(reg->ring_addr,
			     flex_array_size(br, bufs, reg->ring_entries),
			     &nr_pages);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/*
	 * Apparently some 32-bit boxes (ARM) will return highmem pages,
	 * which then need to be mapped. We could support that, but it'd
	 * complicate the code and slow down the common cases quite a bit.
	 * So just error out, returning -EINVAL just like we did on kernels
	 * that didn't support mapped buffer rings.
	 */
	for (i = 0; i < nr_pages; i++)
		if (PageHighMem(pages[i]))
			goto error_unpin;

	br = page_address(pages[0]);
#ifdef SHM_COLOUR
	/*
	 * On platforms that have specific aliasing requirements, SHM_COLOUR
	 * is set and we must guarantee that the kernel and user side align
	 * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
	 * the application mmap's the provided ring buffer. Fail the request
	 * if we, by chance, don't end up with aligned addresses. The app
	 * should use IOU_PBUF_RING_MMAP instead, and liburing will handle
	 * this transparently.
	 */
	if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1))
		goto error_unpin;
#endif
	bl->buf_pages = pages;
	bl->buf_nr_pages = nr_pages;
	bl->buf_ring = br;
	bl->is_mapped = 1;
	bl->is_mmap = 0;
	return 0;
error_unpin:
	for (i = 0; i < nr_pages; i++)
		unpin_user_page(pages[i]);
	kvfree(pages);
	return -EINVAL;
}
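/*
 * Illustrative userspace counterpart (not part of the kernel build): for
 * the pinned (non-mmap) case above, the application allocates the ring
 * page-aligned and registers it. With liburing this is roughly:
 *
 *	struct io_uring_buf_reg reg = { };
 *	struct io_uring_buf_ring *br;
 *
 *	if (posix_memalign((void **)&br, 4096,
 *			   nentries * sizeof(struct io_uring_buf)))
 *		return -ENOMEM;
 *	reg.ring_addr = (unsigned long)br;
 *	reg.ring_entries = nentries;	// power of 2, below 65536
 *	reg.bgid = bgid;
 *	ret = io_uring_register_buf_ring(&ring, &reg, 0);
 *
 * io_uring_buf_ring_init() should then be called to reset the ring tail
 * before any buffers are added.
 */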
static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg,
			      struct io_buffer_list *bl)
{
	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
	size_t ring_size;
	void *ptr;

	ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
	ptr = (void *) __get_free_pages(gfp, get_order(ring_size));
	if (!ptr)
		return -ENOMEM;

	bl->buf_ring = ptr;
	bl->is_mapped = 1;
	bl->is_mmap = 1;
	return 0;
}

int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_buf_reg reg;
	struct io_buffer_list *bl, *free_bl = NULL;
	int ret;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;

	if (reg.resv[0] || reg.resv[1] || reg.resv[2])
		return -EINVAL;
	if (reg.flags & ~IOU_PBUF_RING_MMAP)
		return -EINVAL;
	if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
		if (!reg.ring_addr)
			return -EFAULT;
		if (reg.ring_addr & ~PAGE_MASK)
			return -EINVAL;
	} else {
		if (reg.ring_addr)
			return -EINVAL;
	}

	if (!is_power_of_2(reg.ring_entries))
		return -EINVAL;

	/* cannot disambiguate full vs empty due to head/tail size */
	if (reg.ring_entries >= 65536)
		return -EINVAL;

	if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
		int ret = io_init_bl_list(ctx);
		if (ret)
			return ret;
	}

	bl = io_buffer_get_list(ctx, reg.bgid);
	if (bl) {
		/* if mapped buffer ring OR classic exists, don't allow */
		if (bl->is_mapped || !list_empty(&bl->buf_list))
			return -EEXIST;
	} else {
		free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
		if (!bl)
			return -ENOMEM;
	}

	if (!(reg.flags & IOU_PBUF_RING_MMAP))
		ret = io_pin_pbuf_ring(&reg, bl);
	else
		ret = io_alloc_pbuf_ring(&reg, bl);

	if (!ret) {
		bl->nr_entries = reg.ring_entries;
		bl->mask = reg.ring_entries - 1;

		io_buffer_add_list(ctx, bl, reg.bgid);
		return 0;
	}

	kfree(free_bl);
	return ret;
}

int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_buf_reg reg;
	struct io_buffer_list *bl;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (reg.resv[0] || reg.resv[1] || reg.resv[2])
		return -EINVAL;
	if (reg.flags)
		return -EINVAL;

	bl = io_buffer_get_list(ctx, reg.bgid);
	if (!bl)
		return -ENOENT;
	if (!bl->is_mapped)
		return -EINVAL;

	__io_remove_buffers(ctx, bl, -1U);
	if (bl->bgid >= BGID_ARRAY) {
		xa_erase(&ctx->io_bl_xa, bl->bgid);
		kfree(bl);
	}
	return 0;
}

void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
{
	struct io_buffer_list *bl;

	bl = io_buffer_get_list(ctx, bgid);
	if (!bl || !bl->is_mmap)
		return NULL;

	return bl->buf_ring;
}
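/*
 * Illustrative userspace counterpart (not part of the kernel build): for a
 * ring registered with IOU_PBUF_RING_MMAP, the kernel allocation made in
 * io_alloc_pbuf_ring() is exposed to the application via mmap() on the
 * ring fd, at an offset that encodes the buffer group ID:
 *
 *	off_t off = IORING_OFF_PBUF_RING |
 *		    ((__u64)bgid << IORING_OFF_PBUF_SHIFT);
 *
 *	br = mmap(NULL, nentries * sizeof(struct io_uring_buf),
 *		  PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *		  ring_fd, off);
 *
 * io_pbuf_get_address() above is what resolves that offset back to the
 * kernel-side ring allocation; liburing wraps the register + mmap sequence
 * in io_uring_setup_buf_ring().
 */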