// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/dma-map-ops.h>
#include <linux/mm.h>
#include <linux/nospec.h>
#include <linux/io_uring.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff_ref.h>

#include <net/page_pool/helpers.h>
#include <net/page_pool/memory_provider.h>
#include <net/netlink.h>
#include <net/netdev_rx_queue.h>
#include <net/tcp.h>
#include <net/rps.h>

#include <trace/events/page_pool.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "kbuf.h"
#include "memmap.h"
#include "zcrx.h"
#include "rsrc.h"

#define IO_ZCRX_AREA_SUPPORTED_FLAGS	(IORING_ZCRX_AREA_DMABUF)

#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)

static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp)
{
	return pp->mp_priv;
}

static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
{
	struct net_iov_area *owner = net_iov_owner(niov);

	return container_of(owner, struct io_zcrx_area, nia);
}

static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
{
	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
	unsigned niov_pages_shift;

	lockdep_assert(!area->mem.is_dmabuf);

	niov_pages_shift = area->ifq->niov_shift - PAGE_SHIFT;
	return area->mem.pages[net_iov_idx(niov) << niov_pages_shift];
}

static int io_populate_area_dma(struct io_zcrx_ifq *ifq,
				struct io_zcrx_area *area)
{
	unsigned niov_size = 1U << ifq->niov_shift;
	struct sg_table *sgt = area->mem.sgt;
	struct scatterlist *sg;
	unsigned i, niov_idx = 0;

	for_each_sgtable_dma_sg(sgt, sg, i) {
		dma_addr_t dma = sg_dma_address(sg);
		unsigned long sg_len = sg_dma_len(sg);

		if (WARN_ON_ONCE(sg_len % niov_size))
			return -EINVAL;

		while (sg_len && niov_idx < area->nia.num_niovs) {
			struct net_iov *niov = &area->nia.niovs[niov_idx];

			if (net_mp_niov_set_dma_addr(niov, dma))
				return -EFAULT;
			sg_len -= niov_size;
			dma += niov_size;
			niov_idx++;
		}
	}

	if (WARN_ON_ONCE(niov_idx != area->nia.num_niovs))
		return -EFAULT;
	return 0;
}

static void io_release_dmabuf(struct io_zcrx_mem *mem)
{
	if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
		return;

	if (mem->sgt)
		dma_buf_unmap_attachment_unlocked(mem->attach, mem->sgt,
						  DMA_FROM_DEVICE);
	if (mem->attach)
		dma_buf_detach(mem->dmabuf, mem->attach);
	if (mem->dmabuf)
		dma_buf_put(mem->dmabuf);

	mem->sgt = NULL;
	mem->attach = NULL;
	mem->dmabuf = NULL;
}

static int io_import_dmabuf(struct io_zcrx_ifq *ifq,
			    struct io_zcrx_mem *mem,
			    struct io_uring_zcrx_area_reg *area_reg)
{
	unsigned long off = (unsigned long)area_reg->addr;
	unsigned long len = (unsigned long)area_reg->len;
	unsigned long total_size = 0;
	struct scatterlist *sg;
	int dmabuf_fd = area_reg->dmabuf_fd;
	int i, ret;

	if (off)
		return -EINVAL;
	if (WARN_ON_ONCE(!ifq->dev))
		return -EFAULT;
	if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
		return -EINVAL;

	mem->is_dmabuf = true;
	mem->dmabuf = dma_buf_get(dmabuf_fd);
	if (IS_ERR(mem->dmabuf)) {
		ret = PTR_ERR(mem->dmabuf);
		mem->dmabuf = NULL;
		goto err;
	}

	mem->attach = dma_buf_attach(mem->dmabuf, ifq->dev);
	if (IS_ERR(mem->attach)) {
		ret = PTR_ERR(mem->attach);
		mem->attach = NULL;
		goto err;
	}

	mem->sgt = dma_buf_map_attachment_unlocked(mem->attach, DMA_FROM_DEVICE);
	if (IS_ERR(mem->sgt)) {
		ret = PTR_ERR(mem->sgt);
		mem->sgt = NULL;
		goto err;
	}

	for_each_sgtable_dma_sg(mem->sgt, sg, i)
		total_size += sg_dma_len(sg);

	if (total_size != len) {
		ret = -EINVAL;
		goto err;
	}

	mem->size = len;
	return 0;
err:
	io_release_dmabuf(mem);
	return ret;
}

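/*
 * Count the number of pages to charge for memory accounting. Each distinct
 * folio is counted once at its full size, so pages belonging to the same
 * compound (e.g. huge) folio aren't charged multiple times.
 */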
static unsigned long io_count_account_pages(struct page **pages, unsigned nr_pages)
{
	struct folio *last_folio = NULL;
	unsigned long res = 0;
	int i;

	for (i = 0; i < nr_pages; i++) {
		struct folio *folio = page_folio(pages[i]);

		if (folio == last_folio)
			continue;
		last_folio = folio;
		res += 1UL << folio_order(folio);
	}
	return res;
}

static int io_import_umem(struct io_zcrx_ifq *ifq,
			  struct io_zcrx_mem *mem,
			  struct io_uring_zcrx_area_reg *area_reg)
{
	struct page **pages;
	int nr_pages, ret;

	if (area_reg->dmabuf_fd)
		return -EINVAL;
	if (!area_reg->addr)
		return -EFAULT;
	pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
			     &nr_pages);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	ret = sg_alloc_table_from_pages(&mem->page_sg_table, pages, nr_pages,
					0, nr_pages << PAGE_SHIFT,
					GFP_KERNEL_ACCOUNT);
	if (ret) {
		unpin_user_pages(pages, nr_pages);
		return ret;
	}

	mem->account_pages = io_count_account_pages(pages, nr_pages);
	ret = io_account_mem(ifq->ctx, mem->account_pages);
	if (ret < 0)
		mem->account_pages = 0;

	mem->sgt = &mem->page_sg_table;
	mem->pages = pages;
	mem->nr_folios = nr_pages;
	mem->size = area_reg->len;
	return ret;
}

static void io_release_area_mem(struct io_zcrx_mem *mem)
{
	if (mem->is_dmabuf) {
		io_release_dmabuf(mem);
		return;
	}
	if (mem->pages) {
		unpin_user_pages(mem->pages, mem->nr_folios);
		sg_free_table(mem->sgt);
		mem->sgt = NULL;
		kvfree(mem->pages);
	}
}

static int io_import_area(struct io_zcrx_ifq *ifq,
			  struct io_zcrx_mem *mem,
			  struct io_uring_zcrx_area_reg *area_reg)
{
	int ret;

	if (area_reg->flags & ~IO_ZCRX_AREA_SUPPORTED_FLAGS)
		return -EINVAL;
	if (area_reg->rq_area_token)
		return -EINVAL;
	if (area_reg->__resv2[0] || area_reg->__resv2[1])
		return -EINVAL;

	ret = io_validate_user_buf_range(area_reg->addr, area_reg->len);
	if (ret)
		return ret;
	if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
		return -EINVAL;

	if (area_reg->flags & IORING_ZCRX_AREA_DMABUF)
		return io_import_dmabuf(ifq, mem, area_reg);
	return io_import_umem(ifq, mem, area_reg);
}

static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
			       struct io_zcrx_area *area)
{
	int i;

	guard(mutex)(&ifq->pp_lock);
	if (!area->is_mapped)
		return;
	area->is_mapped = false;

	for (i = 0; i < area->nia.num_niovs; i++)
		net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0);

	if (area->mem.is_dmabuf) {
		io_release_dmabuf(&area->mem);
	} else {
		dma_unmap_sgtable(ifq->dev, &area->mem.page_sg_table,
				  DMA_FROM_DEVICE, IO_DMA_ATTR);
	}
}

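/*
 * DMA map the area and record a DMA address in every net_iov backing it.
 * Serialised against unmapping by ifq->pp_lock; mapping an already mapped
 * area is a no-op.
 */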
static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
	int ret;

	guard(mutex)(&ifq->pp_lock);
	if (area->is_mapped)
		return 0;

	if (!area->mem.is_dmabuf) {
		ret = dma_map_sgtable(ifq->dev, &area->mem.page_sg_table,
				      DMA_FROM_DEVICE, IO_DMA_ATTR);
		if (ret < 0)
			return ret;
	}

	ret = io_populate_area_dma(ifq, area);
	if (ret == 0)
		area->is_mapped = true;
	return ret;
}

static void io_zcrx_sync_for_device(struct page_pool *pool,
				    struct net_iov *niov)
{
#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
	dma_addr_t dma_addr;
	unsigned niov_size;

	if (!dma_dev_need_sync(pool->p.dev))
		return;

	niov_size = 1U << io_pp_to_ifq(pool)->niov_shift;
	dma_addr = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov));
	__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
				     niov_size, pool->p.dma_dir);
#endif
}

#define IO_RQ_MAX_ENTRIES		32768

#define IO_SKBS_PER_CALL_LIMIT		20

struct io_zcrx_args {
	struct io_kiocb		*req;
	struct io_zcrx_ifq	*ifq;
	struct socket		*sock;
	unsigned		nr_skbs;
};

static const struct memory_provider_ops io_uring_pp_zc_ops;

static inline atomic_t *io_get_user_counter(struct net_iov *niov)
{
	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);

	return &area->user_refs[net_iov_idx(niov)];
}

static bool io_zcrx_put_niov_uref(struct net_iov *niov)
{
	atomic_t *uref = io_get_user_counter(niov);

	if (unlikely(!atomic_read(uref)))
		return false;
	atomic_dec(uref);
	return true;
}

static void io_zcrx_get_niov_uref(struct net_iov *niov)
{
	atomic_inc(io_get_user_counter(niov));
}

static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
				 struct io_uring_zcrx_ifq_reg *reg,
				 struct io_uring_region_desc *rd,
				 u32 id)
{
	u64 mmap_offset;
	size_t off, size;
	void *ptr;
	int ret;

	off = ALIGN(sizeof(struct io_uring), L1_CACHE_BYTES);
	size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries;
	if (size > rd->size)
		return -EINVAL;

	mmap_offset = IORING_MAP_OFF_ZCRX_REGION;
	mmap_offset += id << IORING_OFF_PBUF_SHIFT;

	ret = io_create_region(ifq->ctx, &ifq->region, rd, mmap_offset);
	if (ret < 0)
		return ret;

	ptr = io_region_get_ptr(&ifq->region);
	ifq->rq_ring = (struct io_uring *)ptr;
	ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off);

	reg->offsets.head = offsetof(struct io_uring, head);
	reg->offsets.tail = offsetof(struct io_uring, tail);
	reg->offsets.rqes = off;
	return 0;
}

static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
{
	io_free_region(ifq->ctx, &ifq->region);
	ifq->rq_ring = NULL;
	ifq->rqes = NULL;
}

static void io_zcrx_free_area(struct io_zcrx_area *area)
{
	io_zcrx_unmap_area(area->ifq, area);
	io_release_area_mem(&area->mem);

	if (area->mem.account_pages)
		io_unaccount_mem(area->ifq->ctx, area->mem.account_pages);

	kvfree(area->freelist);
	kvfree(area->nia.niovs);
	kvfree(area->user_refs);
	kfree(area);
}

static int io_zcrx_append_area(struct io_zcrx_ifq *ifq,
			       struct io_zcrx_area *area)
{
	if (ifq->area)
		return -EINVAL;
	ifq->area = area;
	return 0;
}

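/*
 * Create the buffer area for an ifq: import the user memory (or dmabuf)
 * backing it and carve it into net_iovs of 1 << ifq->niov_shift bytes,
 * each starting out on the freelist with a zeroed user refcount.
 */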
static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
			       struct io_uring_zcrx_area_reg *area_reg)
{
	struct io_zcrx_area *area;
	unsigned nr_iovs;
	int i, ret;

	ret = -ENOMEM;
	area = kzalloc(sizeof(*area), GFP_KERNEL);
	if (!area)
		goto err;
	area->ifq = ifq;

	ret = io_import_area(ifq, &area->mem, area_reg);
	if (ret)
		goto err;

	ifq->niov_shift = PAGE_SHIFT;
	nr_iovs = area->mem.size >> ifq->niov_shift;
	area->nia.num_niovs = nr_iovs;

	ret = -ENOMEM;
	area->nia.niovs = kvmalloc_array(nr_iovs, sizeof(area->nia.niovs[0]),
					 GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (!area->nia.niovs)
		goto err;

	area->freelist = kvmalloc_array(nr_iovs, sizeof(area->freelist[0]),
					GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (!area->freelist)
		goto err;

	area->user_refs = kvmalloc_array(nr_iovs, sizeof(area->user_refs[0]),
					 GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (!area->user_refs)
		goto err;

	for (i = 0; i < nr_iovs; i++) {
		struct net_iov *niov = &area->nia.niovs[i];

		niov->owner = &area->nia;
		area->freelist[i] = i;
		atomic_set(&area->user_refs[i], 0);
		niov->type = NET_IOV_IOURING;
	}

	area->free_count = nr_iovs;
	/* we're only supporting one area per ifq for now */
	area->area_id = 0;
	area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT;
	spin_lock_init(&area->freelist_lock);

	ret = io_zcrx_append_area(ifq, area);
	if (!ret)
		return 0;
err:
	if (area)
		io_zcrx_free_area(area);
	return ret;
}

static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
{
	struct io_zcrx_ifq *ifq;

	ifq = kzalloc(sizeof(*ifq), GFP_KERNEL);
	if (!ifq)
		return NULL;

	ifq->if_rxq = -1;
	ifq->ctx = ctx;
	spin_lock_init(&ifq->rq_lock);
	mutex_init(&ifq->pp_lock);
	return ifq;
}

static void io_zcrx_drop_netdev(struct io_zcrx_ifq *ifq)
{
	guard(mutex)(&ifq->pp_lock);

	if (!ifq->netdev)
		return;
	netdev_put(ifq->netdev, &ifq->netdev_tracker);
	ifq->netdev = NULL;
}

static void io_close_queue(struct io_zcrx_ifq *ifq)
{
	struct net_device *netdev;
	netdevice_tracker netdev_tracker;
	struct pp_memory_provider_params p = {
		.mp_ops = &io_uring_pp_zc_ops,
		.mp_priv = ifq,
	};

	if (ifq->if_rxq == -1)
		return;

	scoped_guard(mutex, &ifq->pp_lock) {
		netdev = ifq->netdev;
		netdev_tracker = ifq->netdev_tracker;
		ifq->netdev = NULL;
	}

	if (netdev) {
		net_mp_close_rxq(netdev, ifq->if_rxq, &p);
		netdev_put(netdev, &netdev_tracker);
	}
	ifq->if_rxq = -1;
}

static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
{
	io_close_queue(ifq);

	if (ifq->area)
		io_zcrx_free_area(ifq->area);
	if (ifq->dev)
		put_device(ifq->dev);

	io_free_rbuf_ring(ifq);
	mutex_destroy(&ifq->pp_lock);
	kfree(ifq);
}

struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
					    unsigned int id)
{
	struct io_zcrx_ifq *ifq = xa_load(&ctx->zcrx_ctxs, id);

	lockdep_assert_held(&ctx->mmap_lock);

	return ifq ? &ifq->region : NULL;
}

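/*
 * Register a zero copy rx interface queue: validate the registration
 * request, allocate the refill ring, resolve the target netdev and its
 * parent device, create the buffer area and bind the io_uring memory
 * provider to the rx queue.
 */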
int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
			 struct io_uring_zcrx_ifq_reg __user *arg)
{
	struct pp_memory_provider_params mp_param = {};
	struct io_uring_zcrx_area_reg area;
	struct io_uring_zcrx_ifq_reg reg;
	struct io_uring_region_desc rd;
	struct io_zcrx_ifq *ifq;
	int ret;
	u32 id;

	/*
	 * 1. Interface queue allocation.
	 * 2. It can observe data destined for sockets of other tasks.
	 */
	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	/* mandatory io_uring features for zc rx */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		return -EINVAL;
	if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)))
		return -EINVAL;
	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
		return -EFAULT;
	if (!mem_is_zero(&reg.__resv, sizeof(reg.__resv)) ||
	    reg.__resv2 || reg.zcrx_id)
		return -EINVAL;
	if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
		return -EINVAL;
	if (reg.rq_entries > IO_RQ_MAX_ENTRIES) {
		if (!(ctx->flags & IORING_SETUP_CLAMP))
			return -EINVAL;
		reg.rq_entries = IO_RQ_MAX_ENTRIES;
	}
	reg.rq_entries = roundup_pow_of_two(reg.rq_entries);

	if (copy_from_user(&area, u64_to_user_ptr(reg.area_ptr), sizeof(area)))
		return -EFAULT;

	ifq = io_zcrx_ifq_alloc(ctx);
	if (!ifq)
		return -ENOMEM;
	ifq->rq_entries = reg.rq_entries;

	scoped_guard(mutex, &ctx->mmap_lock) {
		/* preallocate id */
		ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL);
		if (ret)
			goto ifq_free;
	}

	ret = io_allocate_rbuf_ring(ifq, &reg, &rd, id);
	if (ret)
		goto err;

	ifq->netdev = netdev_get_by_index(current->nsproxy->net_ns, reg.if_idx,
					  &ifq->netdev_tracker, GFP_KERNEL);
	if (!ifq->netdev) {
		ret = -ENODEV;
		goto err;
	}

	ifq->dev = ifq->netdev->dev.parent;
	if (!ifq->dev) {
		ret = -EOPNOTSUPP;
		goto err;
	}
	get_device(ifq->dev);

	ret = io_zcrx_create_area(ifq, &area);
	if (ret)
		goto err;

	mp_param.mp_ops = &io_uring_pp_zc_ops;
	mp_param.mp_priv = ifq;
	ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param);
	if (ret)
		goto err;
	ifq->if_rxq = reg.if_rxq;

	reg.zcrx_id = id;

	scoped_guard(mutex, &ctx->mmap_lock) {
		/* publish ifq */
		ret = -ENOMEM;
		if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL))
			goto err;
	}

	if (copy_to_user(arg, &reg, sizeof(reg)) ||
	    copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) ||
	    copy_to_user(u64_to_user_ptr(reg.area_ptr), &area, sizeof(area))) {
		ret = -EFAULT;
		goto err;
	}
	return 0;
err:
	scoped_guard(mutex, &ctx->mmap_lock)
		xa_erase(&ctx->zcrx_ctxs, id);
ifq_free:
	io_zcrx_ifq_free(ifq);
	return ret;
}

void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
{
	struct io_zcrx_ifq *ifq;

	lockdep_assert_held(&ctx->uring_lock);

	while (1) {
		scoped_guard(mutex, &ctx->mmap_lock) {
			unsigned long id = 0;

			ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT);
			if (ifq)
				xa_erase(&ctx->zcrx_ctxs, id);
		}
		if (!ifq)
			break;
		io_zcrx_ifq_free(ifq);
	}

	xa_destroy(&ctx->zcrx_ctxs);
}

static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
{
	unsigned niov_idx;

	lockdep_assert_held(&area->freelist_lock);

	niov_idx = area->freelist[--area->free_count];
	return &area->nia.niovs[niov_idx];
}

static void io_zcrx_return_niov_freelist(struct net_iov *niov)
{
	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);

	spin_lock_bh(&area->freelist_lock);
	area->freelist[area->free_count++] = net_iov_idx(niov);
	spin_unlock_bh(&area->freelist_lock);
}

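/*
 * Return a niov to its owner. Buffers handed out by the copy fallback
 * don't belong to a page pool and go straight back onto the area freelist;
 * everything else is released through its page pool.
 */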
static void io_zcrx_return_niov(struct net_iov *niov)
{
	netmem_ref netmem = net_iov_to_netmem(niov);

	if (!niov->pp) {
		/* copy fallback allocated niovs */
		io_zcrx_return_niov_freelist(niov);
		return;
	}
	page_pool_put_unrefed_netmem(niov->pp, netmem, -1, false);
}

static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
{
	struct io_zcrx_area *area = ifq->area;
	int i;

	if (!area)
		return;

	/* Reclaim all buffers handed out to user space. */
	for (i = 0; i < area->nia.num_niovs; i++) {
		struct net_iov *niov = &area->nia.niovs[i];
		int nr;

		if (!atomic_read(io_get_user_counter(niov)))
			continue;
		nr = atomic_xchg(io_get_user_counter(niov), 0);
		if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
			io_zcrx_return_niov(niov);
	}
}

void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
{
	struct io_zcrx_ifq *ifq;
	unsigned long index;

	lockdep_assert_held(&ctx->uring_lock);

	xa_for_each(&ctx->zcrx_ctxs, index, ifq) {
		io_zcrx_scrub(ifq);
		io_close_queue(ifq);
	}
}

static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq)
{
	u32 entries;

	entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head;
	return min(entries, ifq->rq_entries);
}

static struct io_uring_zcrx_rqe *io_zcrx_get_rqe(struct io_zcrx_ifq *ifq,
						 unsigned mask)
{
	unsigned int idx = ifq->cached_rq_head++ & mask;

	return &ifq->rqes[idx];
}

static inline bool io_parse_rqe(struct io_uring_zcrx_rqe *rqe,
				struct io_zcrx_ifq *ifq,
				struct net_iov **ret_niov)
{
	unsigned niov_idx, area_idx;
	struct io_zcrx_area *area;

	area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT;
	niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) >> ifq->niov_shift;

	if (unlikely(rqe->__pad || area_idx))
		return false;
	area = ifq->area;

	if (unlikely(niov_idx >= area->nia.num_niovs))
		return false;
	niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs);

	*ret_niov = &area->nia.niovs[niov_idx];
	return true;
}

static void io_zcrx_ring_refill(struct page_pool *pp,
				struct io_zcrx_ifq *ifq)
{
	unsigned int mask = ifq->rq_entries - 1;
	unsigned int entries;

	guard(spinlock_bh)(&ifq->rq_lock);

	entries = io_zcrx_rqring_entries(ifq);
	entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL);
	if (unlikely(!entries))
		return;

	do {
		struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask);
		struct net_iov *niov;
		netmem_ref netmem;

		if (!io_parse_rqe(rqe, ifq, &niov))
			continue;
		if (!io_zcrx_put_niov_uref(niov))
			continue;

		netmem = net_iov_to_netmem(niov);
		if (!page_pool_unref_and_test(netmem))
			continue;

		if (unlikely(niov->pp != pp)) {
			io_zcrx_return_niov(niov);
			continue;
		}

		io_zcrx_sync_for_device(pp, niov);
		net_mp_netmem_place_in_cache(pp, netmem);
	} while (--entries);

	smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head);
}

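/*
 * Slow path refill: when the refill ring didn't provide enough buffers,
 * top up the page pool cache straight from the area freelist.
 */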
static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq)
{
	struct io_zcrx_area *area = ifq->area;

	spin_lock_bh(&area->freelist_lock);
	while (area->free_count && pp->alloc.count < PP_ALLOC_CACHE_REFILL) {
		struct net_iov *niov = __io_zcrx_get_free_niov(area);
		netmem_ref netmem = net_iov_to_netmem(niov);

		net_mp_niov_set_page_pool(pp, niov);
		io_zcrx_sync_for_device(pp, niov);
		net_mp_netmem_place_in_cache(pp, netmem);
	}
	spin_unlock_bh(&area->freelist_lock);
}

static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
{
	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);

	/* pp should already be ensuring that */
	if (unlikely(pp->alloc.count))
		goto out_return;

	io_zcrx_ring_refill(pp, ifq);
	if (likely(pp->alloc.count))
		goto out_return;

	io_zcrx_refill_slow(pp, ifq);
	if (!pp->alloc.count)
		return 0;
out_return:
	return pp->alloc.cache[--pp->alloc.count];
}

static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)
{
	struct net_iov *niov;

	if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
		return false;

	niov = netmem_to_net_iov(netmem);
	net_mp_niov_clear_page_pool(niov);
	io_zcrx_return_niov_freelist(niov);
	return false;
}

static int io_pp_zc_init(struct page_pool *pp)
{
	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
	int ret;

	if (WARN_ON_ONCE(!ifq))
		return -EINVAL;
	if (WARN_ON_ONCE(ifq->dev != pp->p.dev))
		return -EINVAL;
	if (WARN_ON_ONCE(!pp->dma_map))
		return -EOPNOTSUPP;
	if (pp->p.order + PAGE_SHIFT != ifq->niov_shift)
		return -EINVAL;
	if (pp->p.dma_dir != DMA_FROM_DEVICE)
		return -EOPNOTSUPP;

	ret = io_zcrx_map_area(ifq, ifq->area);
	if (ret)
		return ret;

	percpu_ref_get(&ifq->ctx->refs);
	return 0;
}

static void io_pp_zc_destroy(struct page_pool *pp)
{
	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);

	percpu_ref_put(&ifq->ctx->refs);
}

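/*
 * Netlink introspection hook: advertise io_uring as the memory provider,
 * using the queue attribute when dumping an rx queue and the page pool
 * attribute otherwise.
 */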
static int io_pp_nl_fill(void *mp_priv, struct sk_buff *rsp,
			 struct netdev_rx_queue *rxq)
{
	struct nlattr *nest;
	int type;

	type = rxq ? NETDEV_A_QUEUE_IO_URING : NETDEV_A_PAGE_POOL_IO_URING;
	nest = nla_nest_start(rsp, type);
	if (!nest)
		return -EMSGSIZE;
	nla_nest_end(rsp, nest);

	return 0;
}

static void io_pp_uninstall(void *mp_priv, struct netdev_rx_queue *rxq)
{
	struct pp_memory_provider_params *p = &rxq->mp_params;
	struct io_zcrx_ifq *ifq = mp_priv;

	io_zcrx_drop_netdev(ifq);
	if (ifq->area)
		io_zcrx_unmap_area(ifq, ifq->area);

	p->mp_ops = NULL;
	p->mp_priv = NULL;
}

static const struct memory_provider_ops io_uring_pp_zc_ops = {
	.alloc_netmems = io_pp_zc_alloc_netmems,
	.release_netmem = io_pp_zc_release_netmem,
	.init = io_pp_zc_init,
	.destroy = io_pp_zc_destroy,
	.nl_fill = io_pp_nl_fill,
	.uninstall = io_pp_uninstall,
};

#define IO_ZCRX_MAX_SYS_REFILL_BUFS	(1 << 16)
#define IO_ZCRX_SYS_REFILL_BATCH	32

static void io_return_buffers(struct io_zcrx_ifq *ifq,
			      struct io_uring_zcrx_rqe *rqes, unsigned nr)
{
	int i;

	for (i = 0; i < nr; i++) {
		struct net_iov *niov;
		netmem_ref netmem;

		if (!io_parse_rqe(&rqes[i], ifq, &niov))
			continue;

		scoped_guard(spinlock_bh, &ifq->rq_lock) {
			if (!io_zcrx_put_niov_uref(niov))
				continue;
		}

		netmem = net_iov_to_netmem(niov);
		if (!page_pool_unref_and_test(netmem))
			continue;
		io_zcrx_return_niov(niov);
	}
}

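/*
 * Synchronously return buffers from user space: processes up to
 * IO_ZCRX_MAX_SYS_REFILL_BUFS rqes in batches of IO_ZCRX_SYS_REFILL_BATCH
 * and returns how many entries were consumed, which may be short if a
 * fatal signal is pending.
 */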
int io_zcrx_return_bufs(struct io_ring_ctx *ctx,
			void __user *arg, unsigned nr_arg)
{
	struct io_uring_zcrx_rqe rqes[IO_ZCRX_SYS_REFILL_BATCH];
	struct io_uring_zcrx_rqe __user *user_rqes;
	struct io_uring_zcrx_sync_refill zr;
	struct io_zcrx_ifq *ifq;
	unsigned nr, i;

	if (nr_arg)
		return -EINVAL;
	if (copy_from_user(&zr, arg, sizeof(zr)))
		return -EFAULT;
	if (!zr.nr_entries || zr.nr_entries > IO_ZCRX_MAX_SYS_REFILL_BUFS)
		return -EINVAL;
	if (!mem_is_zero(&zr.__resv, sizeof(zr.__resv)))
		return -EINVAL;

	ifq = xa_load(&ctx->zcrx_ctxs, zr.zcrx_id);
	if (!ifq)
		return -EINVAL;
	nr = zr.nr_entries;
	user_rqes = u64_to_user_ptr(zr.rqes);

	for (i = 0; i < nr;) {
		unsigned batch = min(nr - i, IO_ZCRX_SYS_REFILL_BATCH);
		size_t size = batch * sizeof(rqes[0]);

		if (copy_from_user(rqes, user_rqes + i, size))
			return i ? i : -EFAULT;
		io_return_buffers(ifq, rqes, batch);

		i += batch;

		if (fatal_signal_pending(current))
			return i;
		cond_resched();
	}
	return nr;
}

static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov,
			      struct io_zcrx_ifq *ifq, int off, int len)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_zcrx_cqe *rcqe;
	struct io_zcrx_area *area;
	struct io_uring_cqe *cqe;
	u64 offset;

	if (!io_defer_get_uncommited_cqe(ctx, &cqe))
		return false;

	cqe->user_data = req->cqe.user_data;
	cqe->res = len;
	cqe->flags = IORING_CQE_F_MORE;
	if (ctx->flags & IORING_SETUP_CQE_MIXED)
		cqe->flags |= IORING_CQE_F_32;

	area = io_zcrx_iov_to_area(niov);
	offset = off + (net_iov_idx(niov) << ifq->niov_shift);
	rcqe = (struct io_uring_zcrx_cqe *)(cqe + 1);
	rcqe->off = offset + ((u64)area->area_id << IORING_ZCRX_AREA_SHIFT);
	rcqe->__pad = 0;
	return true;
}

static struct net_iov *io_alloc_fallback_niov(struct io_zcrx_ifq *ifq)
{
	struct io_zcrx_area *area = ifq->area;
	struct net_iov *niov = NULL;

	if (area->mem.is_dmabuf)
		return NULL;

	spin_lock_bh(&area->freelist_lock);
	if (area->free_count)
		niov = __io_zcrx_get_free_niov(area);
	spin_unlock_bh(&area->freelist_lock);

	if (niov)
		page_pool_fragment_netmem(net_iov_to_netmem(niov), 1);
	return niov;
}

struct io_copy_cache {
	struct page *page;
	unsigned long offset;
	size_t size;
};

static ssize_t io_copy_page(struct io_copy_cache *cc, struct page *src_page,
			    unsigned int src_offset, size_t len)
{
	size_t copied = 0;

	len = min(len, cc->size);

	while (len) {
		void *src_addr, *dst_addr;
		struct page *dst_page = cc->page;
		unsigned dst_offset = cc->offset;
		size_t n = len;

		if (folio_test_partial_kmap(page_folio(dst_page)) ||
		    folio_test_partial_kmap(page_folio(src_page))) {
			dst_page = nth_page(dst_page, dst_offset / PAGE_SIZE);
			dst_offset = offset_in_page(dst_offset);
			src_page = nth_page(src_page, src_offset / PAGE_SIZE);
			src_offset = offset_in_page(src_offset);
			n = min(PAGE_SIZE - src_offset, PAGE_SIZE - dst_offset);
			n = min(n, len);
		}

		dst_addr = kmap_local_page(dst_page) + dst_offset;
		src_addr = kmap_local_page(src_page) + src_offset;

		memcpy(dst_addr, src_addr, n);

		kunmap_local(src_addr);
		kunmap_local(dst_addr);

		cc->size -= n;
		cc->offset += n;
		len -= n;
		copied += n;
	}
	return copied;
}

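/*
 * Copy fallback for data that can't be delivered by reference, e.g. the
 * linear part of an skb or frags not owned by this ifq: pull free niovs
 * from the area, copy the payload in and post a CQE per filled niov.
 */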
static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
				  struct page *src_page, unsigned int src_offset,
				  size_t len)
{
	size_t copied = 0;
	int ret = 0;

	while (len) {
		struct io_copy_cache cc;
		struct net_iov *niov;
		size_t n;

		niov = io_alloc_fallback_niov(ifq);
		if (!niov) {
			ret = -ENOMEM;
			break;
		}

		cc.page = io_zcrx_iov_page(niov);
		cc.offset = 0;
		cc.size = PAGE_SIZE;

		n = io_copy_page(&cc, src_page, src_offset, len);

		if (!io_zcrx_queue_cqe(req, niov, ifq, 0, n)) {
			io_zcrx_return_niov(niov);
			ret = -ENOSPC;
			break;
		}

		io_zcrx_get_niov_uref(niov);
		src_offset += n;
		len -= n;
		copied += n;
	}

	return copied ? copied : ret;
}

static int io_zcrx_copy_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
			     const skb_frag_t *frag, int off, int len)
{
	struct page *page = skb_frag_page(frag);

	return io_zcrx_copy_chunk(req, ifq, page, off + skb_frag_off(frag), len);
}

static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
			     const skb_frag_t *frag, int off, int len)
{
	struct net_iov *niov;

	if (unlikely(!skb_frag_is_net_iov(frag)))
		return io_zcrx_copy_frag(req, ifq, frag, off, len);

	niov = netmem_to_net_iov(frag->netmem);
	if (!niov->pp || niov->pp->mp_ops != &io_uring_pp_zc_ops ||
	    io_pp_to_ifq(niov->pp) != ifq)
		return -EFAULT;

	if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len))
		return -ENOSPC;

	/*
	 * Prevent it from being recycled while user is accessing it.
	 * It has to be done before grabbing a user reference.
	 */
	page_pool_ref_netmem(net_iov_to_netmem(niov));
	io_zcrx_get_niov_uref(niov);
	return len;
}

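/*
 * tcp_read_sock() callback: deliver frags backed by this ifq as zero copy
 * CQEs, fall back to copying for everything else, and cap the number of
 * skbs handled per call at IO_SKBS_PER_CALL_LIMIT.
 */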
static int
io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
		 unsigned int offset, size_t len)
{
	struct io_zcrx_args *args = desc->arg.data;
	struct io_zcrx_ifq *ifq = args->ifq;
	struct io_kiocb *req = args->req;
	struct sk_buff *frag_iter;
	unsigned start, start_off = offset;
	int i, copy, end, off;
	int ret = 0;

	len = min_t(size_t, len, desc->count);
	/*
	 * __tcp_read_sock() always calls io_zcrx_recv_skb one last time, even
	 * if desc->count is already 0. This is caused by the if (offset + 1 !=
	 * skb->len) check. Return early in this case to break out of
	 * __tcp_read_sock().
	 */
	if (!len)
		return 0;
	if (unlikely(args->nr_skbs++ > IO_SKBS_PER_CALL_LIMIT))
		return -EAGAIN;

	if (unlikely(offset < skb_headlen(skb))) {
		ssize_t copied;
		size_t to_copy;

		to_copy = min_t(size_t, skb_headlen(skb) - offset, len);
		copied = io_zcrx_copy_chunk(req, ifq, virt_to_page(skb->data),
					    offset_in_page(skb->data) + offset,
					    to_copy);
		if (copied < 0) {
			ret = copied;
			goto out;
		}
		offset += copied;
		len -= copied;
		if (!len)
			goto out;
		if (offset != skb_headlen(skb))
			goto out;
	}

	start = skb_headlen(skb);

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		const skb_frag_t *frag;

		if (WARN_ON(start > offset + len))
			return -EFAULT;

		frag = &skb_shinfo(skb)->frags[i];
		end = start + skb_frag_size(frag);

		if (offset < end) {
			copy = end - offset;
			if (copy > len)
				copy = len;

			off = offset - start;
			ret = io_zcrx_recv_frag(req, ifq, frag, off, copy);
			if (ret < 0)
				goto out;

			offset += ret;
			len -= ret;
			if (len == 0 || ret != copy)
				goto out;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		if (WARN_ON(start > offset + len))
			return -EFAULT;

		end = start + frag_iter->len;
		if (offset < end) {
			copy = end - offset;
			if (copy > len)
				copy = len;

			off = offset - start;
			ret = io_zcrx_recv_skb(desc, frag_iter, off, copy);
			if (ret < 0)
				goto out;

			offset += ret;
			len -= ret;
			if (len == 0 || ret != copy)
				goto out;
		}
		start = end;
	}

out:
	if (offset == start_off)
		return ret;
	desc->count -= (offset - start_off);
	return offset - start_off;
}

static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
			       struct sock *sk, int flags,
			       unsigned issue_flags, unsigned int *outlen)
{
	unsigned int len = *outlen;
	struct io_zcrx_args args = {
		.req = req,
		.ifq = ifq,
		.sock = sk->sk_socket,
	};
	read_descriptor_t rd_desc = {
		.count = len ? len : UINT_MAX,
		.arg.data = &args,
	};
	int ret;

	lock_sock(sk);
	ret = tcp_read_sock(sk, &rd_desc, io_zcrx_recv_skb);
	if (len && ret > 0)
		*outlen = len - ret;
	if (ret <= 0) {
		if (ret < 0 || sock_flag(sk, SOCK_DONE))
			goto out;
		if (sk->sk_err)
			ret = sock_error(sk);
		else if (sk->sk_shutdown & RCV_SHUTDOWN)
			goto out;
		else if (sk->sk_state == TCP_CLOSE)
			ret = -ENOTCONN;
		else
			ret = -EAGAIN;
	} else if (unlikely(args.nr_skbs > IO_SKBS_PER_CALL_LIMIT) &&
		   (issue_flags & IO_URING_F_MULTISHOT)) {
		ret = IOU_REQUEUE;
	} else if (sock_flag(sk, SOCK_DONE)) {
		/* Make it retry until it finally gets 0. */
		if (issue_flags & IO_URING_F_MULTISHOT)
			ret = IOU_REQUEUE;
		else
			ret = -EAGAIN;
	}
out:
	release_sock(sk);
	return ret;
}

int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
		 struct socket *sock, unsigned int flags,
		 unsigned issue_flags, unsigned int *len)
{
	struct sock *sk = sock->sk;
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot->recvmsg != tcp_recvmsg)
		return -EPROTONOSUPPORT;

	sock_rps_record_flow(sk);
	return io_zcrx_tcp_recvmsg(req, ifq, sk, flags, issue_flags, len);
}