// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/dma-map-ops.h>
#include <linux/mm.h>
#include <linux/nospec.h>
#include <linux/io_uring.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff_ref.h>

#include <net/page_pool/helpers.h>
#include <net/page_pool/memory_provider.h>
#include <net/netlink.h>
#include <net/netdev_rx_queue.h>
#include <net/tcp.h>
#include <net/rps.h>

#include <trace/events/page_pool.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "kbuf.h"
#include "memmap.h"
#include "zcrx.h"
#include "rsrc.h"

#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)

static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp)
{
	return pp->mp_priv;
}

static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
{
	struct net_iov_area *owner = net_iov_owner(niov);

	return container_of(owner, struct io_zcrx_area, nia);
}

static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
{
	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);

	return area->mem.pages[net_iov_idx(niov)];
}

static void io_release_dmabuf(struct io_zcrx_mem *mem)
{
	if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
		return;

	if (mem->sgt)
		dma_buf_unmap_attachment_unlocked(mem->attach, mem->sgt,
						  DMA_FROM_DEVICE);
	if (mem->attach)
		dma_buf_detach(mem->dmabuf, mem->attach);
	if (mem->dmabuf)
		dma_buf_put(mem->dmabuf);

	mem->sgt = NULL;
	mem->attach = NULL;
	mem->dmabuf = NULL;
}

static int io_import_dmabuf(struct io_zcrx_ifq *ifq,
			    struct io_zcrx_mem *mem,
			    struct io_uring_zcrx_area_reg *area_reg)
{
	unsigned long off = (unsigned long)area_reg->addr;
	unsigned long len = (unsigned long)area_reg->len;
	unsigned long total_size = 0;
	struct scatterlist *sg;
	int dmabuf_fd = area_reg->dmabuf_fd;
	int i, ret;

	if (WARN_ON_ONCE(!ifq->dev))
		return -EFAULT;
	if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
		return -EINVAL;

	mem->is_dmabuf = true;
	mem->dmabuf = dma_buf_get(dmabuf_fd);
	if (IS_ERR(mem->dmabuf)) {
		ret = PTR_ERR(mem->dmabuf);
		mem->dmabuf = NULL;
		goto err;
	}

	mem->attach = dma_buf_attach(mem->dmabuf, ifq->dev);
	if (IS_ERR(mem->attach)) {
		ret = PTR_ERR(mem->attach);
		mem->attach = NULL;
		goto err;
	}

	mem->sgt = dma_buf_map_attachment_unlocked(mem->attach, DMA_FROM_DEVICE);
	if (IS_ERR(mem->sgt)) {
		ret = PTR_ERR(mem->sgt);
		mem->sgt = NULL;
		goto err;
	}

	for_each_sgtable_dma_sg(mem->sgt, sg, i)
		total_size += sg_dma_len(sg);

	if (total_size < off + len)
		return -EINVAL;

	mem->dmabuf_offset = off;
	mem->size = len;
	return 0;
err:
	io_release_dmabuf(mem);
	return ret;
}

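/*
 * Walk the dma-buf sg table, skip the registered dmabuf_offset, and hand out
 * one PAGE_SIZE DMA address per net_iov. io_zcrx_map_area() treats anything
 * short of nia.num_niovs mapped niovs as failure and unwinds.
 */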
static int io_zcrx_map_area_dmabuf(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
	unsigned long off = area->mem.dmabuf_offset;
	struct scatterlist *sg;
	unsigned i, niov_idx = 0;

	if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
		return -EINVAL;

	for_each_sgtable_dma_sg(area->mem.sgt, sg, i) {
		dma_addr_t dma = sg_dma_address(sg);
		unsigned long sg_len = sg_dma_len(sg);
		unsigned long sg_off = min(sg_len, off);

		off -= sg_off;
		sg_len -= sg_off;
		dma += sg_off;

		while (sg_len && niov_idx < area->nia.num_niovs) {
			struct net_iov *niov = &area->nia.niovs[niov_idx];

			if (net_mp_niov_set_dma_addr(niov, dma))
				return 0;
			sg_len -= PAGE_SIZE;
			dma += PAGE_SIZE;
			niov_idx++;
		}
	}
	return niov_idx;
}

static int io_import_umem(struct io_zcrx_ifq *ifq,
			  struct io_zcrx_mem *mem,
			  struct io_uring_zcrx_area_reg *area_reg)
{
	struct page **pages;
	int nr_pages;

	if (area_reg->dmabuf_fd)
		return -EINVAL;
	if (!area_reg->addr)
		return -EFAULT;
	pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
			     &nr_pages);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	mem->pages = pages;
	mem->nr_folios = nr_pages;
	mem->size = area_reg->len;
	return 0;
}

static void io_release_area_mem(struct io_zcrx_mem *mem)
{
	if (mem->is_dmabuf) {
		io_release_dmabuf(mem);
		return;
	}
	if (mem->pages) {
		unpin_user_pages(mem->pages, mem->nr_folios);
		kvfree(mem->pages);
	}
}

static int io_import_area(struct io_zcrx_ifq *ifq,
			  struct io_zcrx_mem *mem,
			  struct io_uring_zcrx_area_reg *area_reg)
{
	int ret;

	ret = io_validate_user_buf_range(area_reg->addr, area_reg->len);
	if (ret)
		return ret;
	if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
		return -EINVAL;

	if (area_reg->flags & IORING_ZCRX_AREA_DMABUF)
		return io_import_dmabuf(ifq, mem, area_reg);
	return io_import_umem(ifq, mem, area_reg);
}

static void io_zcrx_unmap_umem(struct io_zcrx_ifq *ifq,
			       struct io_zcrx_area *area, int nr_mapped)
{
	int i;

	for (i = 0; i < nr_mapped; i++) {
		netmem_ref netmem = net_iov_to_netmem(&area->nia.niovs[i]);
		dma_addr_t dma = page_pool_get_dma_addr_netmem(netmem);

		dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE,
				     DMA_FROM_DEVICE, IO_DMA_ATTR);
	}
}

static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
				 struct io_zcrx_area *area, int nr_mapped)
{
	int i;

	if (area->mem.is_dmabuf)
		io_release_dmabuf(&area->mem);
	else
		io_zcrx_unmap_umem(ifq, area, nr_mapped);

	for (i = 0; i < area->nia.num_niovs; i++)
		net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0);
}

static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
	guard(mutex)(&ifq->dma_lock);

	if (area->is_mapped)
		__io_zcrx_unmap_area(ifq, area, area->nia.num_niovs);
	area->is_mapped = false;
}

static int io_zcrx_map_area_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
	int i;

	for (i = 0; i < area->nia.num_niovs; i++) {
		struct net_iov *niov = &area->nia.niovs[i];
		dma_addr_t dma;

		dma = dma_map_page_attrs(ifq->dev, area->mem.pages[i], 0,
					 PAGE_SIZE, DMA_FROM_DEVICE, IO_DMA_ATTR);
		if (dma_mapping_error(ifq->dev, dma))
			break;
		if (net_mp_niov_set_dma_addr(niov, dma)) {
			dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE,
					     DMA_FROM_DEVICE, IO_DMA_ATTR);
			break;
		}
	}
	return i;
}

static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
	unsigned nr;

	guard(mutex)(&ifq->dma_lock);
	if (area->is_mapped)
		return 0;

	if (area->mem.is_dmabuf)
		nr = io_zcrx_map_area_dmabuf(ifq, area);
	else
		nr = io_zcrx_map_area_umem(ifq, area);

	if (nr != area->nia.num_niovs) {
		__io_zcrx_unmap_area(ifq, area, nr);
		return -EINVAL;
	}

	area->is_mapped = true;
	return 0;
}

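/*
 * Sync a niov's buffer for device access before it is recycled into the page
 * pool's alloc cache. Compiled out without CONFIG_DMA_NEED_SYNC and skipped
 * at runtime when the device does not need syncing.
 */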
static void io_zcrx_sync_for_device(const struct page_pool *pool,
				    struct net_iov *niov)
{
#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
	dma_addr_t dma_addr;

	if (!dma_dev_need_sync(pool->p.dev))
		return;

	dma_addr = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov));
	__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
				     PAGE_SIZE, pool->p.dma_dir);
#endif
}

#define IO_RQ_MAX_ENTRIES	32768

#define IO_SKBS_PER_CALL_LIMIT	20

struct io_zcrx_args {
	struct io_kiocb		*req;
	struct io_zcrx_ifq	*ifq;
	struct socket		*sock;
	unsigned		nr_skbs;
};

static const struct memory_provider_ops io_uring_pp_zc_ops;

static inline atomic_t *io_get_user_counter(struct net_iov *niov)
{
	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);

	return &area->user_refs[net_iov_idx(niov)];
}

static bool io_zcrx_put_niov_uref(struct net_iov *niov)
{
	atomic_t *uref = io_get_user_counter(niov);

	if (unlikely(!atomic_read(uref)))
		return false;
	atomic_dec(uref);
	return true;
}

static void io_zcrx_get_niov_uref(struct net_iov *niov)
{
	atomic_inc(io_get_user_counter(niov));
}

static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
				 struct io_uring_zcrx_ifq_reg *reg,
				 struct io_uring_region_desc *rd,
				 u32 id)
{
	u64 mmap_offset;
	size_t off, size;
	void *ptr;
	int ret;

	off = sizeof(struct io_uring);
	size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries;
	if (size > rd->size)
		return -EINVAL;

	mmap_offset = IORING_MAP_OFF_ZCRX_REGION;
	mmap_offset += id << IORING_OFF_PBUF_SHIFT;

	ret = io_create_region(ifq->ctx, &ifq->region, rd, mmap_offset);
	if (ret < 0)
		return ret;

	ptr = io_region_get_ptr(&ifq->region);
	ifq->rq_ring = (struct io_uring *)ptr;
	ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
	return 0;
}

static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
{
	io_free_region(ifq->ctx, &ifq->region);
	ifq->rq_ring = NULL;
	ifq->rqes = NULL;
}

static void io_zcrx_free_area(struct io_zcrx_area *area)
{
	if (area->ifq)
		io_zcrx_unmap_area(area->ifq, area);
	io_release_area_mem(&area->mem);

	kvfree(area->freelist);
	kvfree(area->nia.niovs);
	kvfree(area->user_refs);
	kfree(area);
}

#define IO_ZCRX_AREA_SUPPORTED_FLAGS	(IORING_ZCRX_AREA_DMABUF)

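/*
 * An area describes the user-provided buffer memory: one net_iov, one
 * freelist slot and one user refcount per page. Only a single area (id 0)
 * per interface queue is supported for now.
 */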
static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
			       struct io_zcrx_area **res,
			       struct io_uring_zcrx_area_reg *area_reg)
{
	struct io_zcrx_area *area;
	unsigned nr_iovs;
	int i, ret;

	if (area_reg->flags & ~IO_ZCRX_AREA_SUPPORTED_FLAGS)
		return -EINVAL;
	if (area_reg->rq_area_token)
		return -EINVAL;
	if (area_reg->__resv2[0] || area_reg->__resv2[1])
		return -EINVAL;

	ret = -ENOMEM;
	area = kzalloc(sizeof(*area), GFP_KERNEL);
	if (!area)
		goto err;

	ret = io_import_area(ifq, &area->mem, area_reg);
	if (ret)
		goto err;

	nr_iovs = area->mem.size >> PAGE_SHIFT;
	area->nia.num_niovs = nr_iovs;

	ret = -ENOMEM;
	area->nia.niovs = kvmalloc_array(nr_iovs, sizeof(area->nia.niovs[0]),
					 GFP_KERNEL | __GFP_ZERO);
	if (!area->nia.niovs)
		goto err;

	area->freelist = kvmalloc_array(nr_iovs, sizeof(area->freelist[0]),
					GFP_KERNEL | __GFP_ZERO);
	if (!area->freelist)
		goto err;

	area->user_refs = kvmalloc_array(nr_iovs, sizeof(area->user_refs[0]),
					 GFP_KERNEL | __GFP_ZERO);
	if (!area->user_refs)
		goto err;

	for (i = 0; i < nr_iovs; i++) {
		struct net_iov *niov = &area->nia.niovs[i];

		niov->owner = &area->nia;
		area->freelist[i] = i;
		atomic_set(&area->user_refs[i], 0);
		niov->type = NET_IOV_IOURING;
	}

	area->free_count = nr_iovs;
	area->ifq = ifq;
	/* we're only supporting one area per ifq for now */
	area->area_id = 0;
	area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT;
	spin_lock_init(&area->freelist_lock);
	*res = area;
	return 0;
err:
	if (area)
		io_zcrx_free_area(area);
	return ret;
}

static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
{
	struct io_zcrx_ifq *ifq;

	ifq = kzalloc(sizeof(*ifq), GFP_KERNEL);
	if (!ifq)
		return NULL;

	ifq->if_rxq = -1;
	ifq->ctx = ctx;
	spin_lock_init(&ifq->lock);
	spin_lock_init(&ifq->rq_lock);
	mutex_init(&ifq->dma_lock);
	return ifq;
}

static void io_zcrx_drop_netdev(struct io_zcrx_ifq *ifq)
{
	spin_lock(&ifq->lock);
	if (ifq->netdev) {
		netdev_put(ifq->netdev, &ifq->netdev_tracker);
		ifq->netdev = NULL;
	}
	spin_unlock(&ifq->lock);
}

static void io_close_queue(struct io_zcrx_ifq *ifq)
{
	struct net_device *netdev;
	netdevice_tracker netdev_tracker;
	struct pp_memory_provider_params p = {
		.mp_ops = &io_uring_pp_zc_ops,
		.mp_priv = ifq,
	};

	if (ifq->if_rxq == -1)
		return;

	spin_lock(&ifq->lock);
	netdev = ifq->netdev;
	netdev_tracker = ifq->netdev_tracker;
	ifq->netdev = NULL;
	spin_unlock(&ifq->lock);

	if (netdev) {
		net_mp_close_rxq(netdev, ifq->if_rxq, &p);
		netdev_put(netdev, &netdev_tracker);
	}
	ifq->if_rxq = -1;
}

static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
{
	io_close_queue(ifq);
	io_zcrx_drop_netdev(ifq);

	if (ifq->area)
		io_zcrx_free_area(ifq->area);
	if (ifq->dev)
		put_device(ifq->dev);

	io_free_rbuf_ring(ifq);
	mutex_destroy(&ifq->dma_lock);
	kfree(ifq);
}

struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
					    unsigned int id)
{
	struct io_zcrx_ifq *ifq = xa_load(&ctx->zcrx_ctxs, id);

	lockdep_assert_held(&ctx->mmap_lock);

	return ifq ? &ifq->region : NULL;
}

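/*
 * Registration path: validate the user's ifq and area descriptors, allocate
 * the refill ring region, resolve the netdev and the parent device used for
 * DMA mapping, create the buffer area, bind the rx queue to this memory
 * provider, and finally publish the ifq in ctx->zcrx_ctxs.
 */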
int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
			 struct io_uring_zcrx_ifq_reg __user *arg)
{
	struct pp_memory_provider_params mp_param = {};
	struct io_uring_zcrx_area_reg area;
	struct io_uring_zcrx_ifq_reg reg;
	struct io_uring_region_desc rd;
	struct io_zcrx_ifq *ifq;
	int ret;
	u32 id;

	/*
	 * 1. Interface queue allocation.
	 * 2. It can observe data destined for sockets of other tasks.
	 */
	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	/* mandatory io_uring features for zc rx */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN &&
	      ctx->flags & IORING_SETUP_CQE32))
		return -EINVAL;
	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)) ||
	    reg.__resv2 || reg.zcrx_id)
		return -EINVAL;
	if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
		return -EINVAL;
	if (reg.rq_entries > IO_RQ_MAX_ENTRIES) {
		if (!(ctx->flags & IORING_SETUP_CLAMP))
			return -EINVAL;
		reg.rq_entries = IO_RQ_MAX_ENTRIES;
	}
	reg.rq_entries = roundup_pow_of_two(reg.rq_entries);

	if (copy_from_user(&area, u64_to_user_ptr(reg.area_ptr), sizeof(area)))
		return -EFAULT;

	ifq = io_zcrx_ifq_alloc(ctx);
	if (!ifq)
		return -ENOMEM;
	ifq->rq_entries = reg.rq_entries;

	scoped_guard(mutex, &ctx->mmap_lock) {
		/* preallocate id */
		ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL);
		if (ret)
			goto ifq_free;
	}

	ret = io_allocate_rbuf_ring(ifq, &reg, &rd, id);
	if (ret)
		goto err;

	ifq->netdev = netdev_get_by_index(current->nsproxy->net_ns, reg.if_idx,
					  &ifq->netdev_tracker, GFP_KERNEL);
	if (!ifq->netdev) {
		ret = -ENODEV;
		goto err;
	}

	ifq->dev = ifq->netdev->dev.parent;
	if (!ifq->dev) {
		ret = -EOPNOTSUPP;
		goto err;
	}
	get_device(ifq->dev);

	ret = io_zcrx_create_area(ifq, &ifq->area, &area);
	if (ret)
		goto err;

	mp_param.mp_ops = &io_uring_pp_zc_ops;
	mp_param.mp_priv = ifq;
	ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param);
	if (ret)
		goto err;
	ifq->if_rxq = reg.if_rxq;

	reg.offsets.rqes = sizeof(struct io_uring);
	reg.offsets.head = offsetof(struct io_uring, head);
	reg.offsets.tail = offsetof(struct io_uring, tail);
	reg.zcrx_id = id;

	scoped_guard(mutex, &ctx->mmap_lock) {
		/* publish ifq */
		ret = -ENOMEM;
		if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL))
			goto err;
	}

	if (copy_to_user(arg, &reg, sizeof(reg)) ||
	    copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) ||
	    copy_to_user(u64_to_user_ptr(reg.area_ptr), &area, sizeof(area))) {
		ret = -EFAULT;
		goto err;
	}
	return 0;
err:
	scoped_guard(mutex, &ctx->mmap_lock)
		xa_erase(&ctx->zcrx_ctxs, id);
ifq_free:
	io_zcrx_ifq_free(ifq);
	return ret;
}

void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
{
	struct io_zcrx_ifq *ifq;

	lockdep_assert_held(&ctx->uring_lock);

	while (1) {
		scoped_guard(mutex, &ctx->mmap_lock) {
			unsigned long id = 0;

			ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT);
			if (ifq)
				xa_erase(&ctx->zcrx_ctxs, id);
		}
		if (!ifq)
			break;
		io_zcrx_ifq_free(ifq);
	}

	xa_destroy(&ctx->zcrx_ctxs);
}

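/*
 * Freelist helpers. Niovs not currently owned by a page pool (e.g. those
 * handed out by the copy fallback) live on the per-area freelist, protected
 * by freelist_lock.
 */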
static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
{
	unsigned niov_idx;

	lockdep_assert_held(&area->freelist_lock);

	niov_idx = area->freelist[--area->free_count];
	return &area->nia.niovs[niov_idx];
}

static void io_zcrx_return_niov_freelist(struct net_iov *niov)
{
	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);

	spin_lock_bh(&area->freelist_lock);
	area->freelist[area->free_count++] = net_iov_idx(niov);
	spin_unlock_bh(&area->freelist_lock);
}

static void io_zcrx_return_niov(struct net_iov *niov)
{
	netmem_ref netmem = net_iov_to_netmem(niov);

	if (!niov->pp) {
		/* copy fallback allocated niovs */
		io_zcrx_return_niov_freelist(niov);
		return;
	}
	page_pool_put_unrefed_netmem(niov->pp, netmem, -1, false);
}

static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
{
	struct io_zcrx_area *area = ifq->area;
	int i;

	if (!area)
		return;

	/* Reclaim back all buffers given to the user space. */
	for (i = 0; i < area->nia.num_niovs; i++) {
		struct net_iov *niov = &area->nia.niovs[i];
		int nr;

		if (!atomic_read(io_get_user_counter(niov)))
			continue;
		nr = atomic_xchg(io_get_user_counter(niov), 0);
		if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
			io_zcrx_return_niov(niov);
	}
}

void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
{
	struct io_zcrx_ifq *ifq;
	unsigned long index;

	lockdep_assert_held(&ctx->uring_lock);

	xa_for_each(&ctx->zcrx_ctxs, index, ifq) {
		io_zcrx_scrub(ifq);
		io_close_queue(ifq);
	}
}

static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq)
{
	u32 entries;

	entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head;
	return min(entries, ifq->rq_entries);
}

static struct io_uring_zcrx_rqe *io_zcrx_get_rqe(struct io_zcrx_ifq *ifq,
						 unsigned mask)
{
	unsigned int idx = ifq->cached_rq_head++ & mask;

	return &ifq->rqes[idx];
}

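/*
 * Fast refill path: consume entries that userspace posted to the refill ring
 * (tail is advanced by userspace, head by the kernel). Each entry is
 * validated, its user reference dropped, and once the page pool reference
 * falls to zero the niov is placed straight into the pool's alloc cache.
 */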
static void io_zcrx_ring_refill(struct page_pool *pp,
				struct io_zcrx_ifq *ifq)
{
	unsigned int mask = ifq->rq_entries - 1;
	unsigned int entries;
	netmem_ref netmem;

	spin_lock_bh(&ifq->rq_lock);

	entries = io_zcrx_rqring_entries(ifq);
	entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL - pp->alloc.count);
	if (unlikely(!entries)) {
		spin_unlock_bh(&ifq->rq_lock);
		return;
	}

	do {
		struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask);
		struct io_zcrx_area *area;
		struct net_iov *niov;
		unsigned niov_idx, area_idx;

		area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT;
		niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) >> PAGE_SHIFT;

		if (unlikely(rqe->__pad || area_idx))
			continue;
		area = ifq->area;

		if (unlikely(niov_idx >= area->nia.num_niovs))
			continue;
		niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs);

		niov = &area->nia.niovs[niov_idx];
		if (!io_zcrx_put_niov_uref(niov))
			continue;

		netmem = net_iov_to_netmem(niov);
		if (page_pool_unref_netmem(netmem, 1) != 0)
			continue;

		if (unlikely(niov->pp != pp)) {
			io_zcrx_return_niov(niov);
			continue;
		}

		io_zcrx_sync_for_device(pp, niov);
		net_mp_netmem_place_in_cache(pp, netmem);
	} while (--entries);

	smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head);
	spin_unlock_bh(&ifq->rq_lock);
}

static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq)
{
	struct io_zcrx_area *area = ifq->area;

	spin_lock_bh(&area->freelist_lock);
	while (area->free_count && pp->alloc.count < PP_ALLOC_CACHE_REFILL) {
		struct net_iov *niov = __io_zcrx_get_free_niov(area);
		netmem_ref netmem = net_iov_to_netmem(niov);

		net_mp_niov_set_page_pool(pp, niov);
		io_zcrx_sync_for_device(pp, niov);
		net_mp_netmem_place_in_cache(pp, netmem);
	}
	spin_unlock_bh(&area->freelist_lock);
}

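/*
 * Page pool memory provider callbacks. Allocation first tries the refill
 * ring and then falls back to the area freelist; released netmems go back to
 * the freelist rather than to the page allocator.
 */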
static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
{
	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);

	/* pp should already be ensuring that */
	if (unlikely(pp->alloc.count))
		goto out_return;

	io_zcrx_ring_refill(pp, ifq);
	if (likely(pp->alloc.count))
		goto out_return;

	io_zcrx_refill_slow(pp, ifq);
	if (!pp->alloc.count)
		return 0;
out_return:
	return pp->alloc.cache[--pp->alloc.count];
}

static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)
{
	struct net_iov *niov;

	if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
		return false;

	niov = netmem_to_net_iov(netmem);
	net_mp_niov_clear_page_pool(niov);
	io_zcrx_return_niov_freelist(niov);
	return false;
}

static int io_pp_zc_init(struct page_pool *pp)
{
	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
	int ret;

	if (WARN_ON_ONCE(!ifq))
		return -EINVAL;
	if (WARN_ON_ONCE(ifq->dev != pp->p.dev))
		return -EINVAL;
	if (WARN_ON_ONCE(!pp->dma_map))
		return -EOPNOTSUPP;
	if (pp->p.order != 0)
		return -EOPNOTSUPP;
	if (pp->p.dma_dir != DMA_FROM_DEVICE)
		return -EOPNOTSUPP;

	ret = io_zcrx_map_area(ifq, ifq->area);
	if (ret)
		return ret;

	percpu_ref_get(&ifq->ctx->refs);
	return 0;
}

static void io_pp_zc_destroy(struct page_pool *pp)
{
	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
	struct io_zcrx_area *area = ifq->area;

	if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs))
		return;
	percpu_ref_put(&ifq->ctx->refs);
}

static int io_pp_nl_fill(void *mp_priv, struct sk_buff *rsp,
			 struct netdev_rx_queue *rxq)
{
	struct nlattr *nest;
	int type;

	type = rxq ? NETDEV_A_QUEUE_IO_URING : NETDEV_A_PAGE_POOL_IO_URING;
	nest = nla_nest_start(rsp, type);
	if (!nest)
		return -EMSGSIZE;
	nla_nest_end(rsp, nest);

	return 0;
}

static void io_pp_uninstall(void *mp_priv, struct netdev_rx_queue *rxq)
{
	struct pp_memory_provider_params *p = &rxq->mp_params;
	struct io_zcrx_ifq *ifq = mp_priv;

	io_zcrx_drop_netdev(ifq);
	if (ifq->area)
		io_zcrx_unmap_area(ifq, ifq->area);

	p->mp_ops = NULL;
	p->mp_priv = NULL;
}

static const struct memory_provider_ops io_uring_pp_zc_ops = {
	.alloc_netmems		= io_pp_zc_alloc_netmems,
	.release_netmem		= io_pp_zc_release_netmem,
	.init			= io_pp_zc_init,
	.destroy		= io_pp_zc_destroy,
	.nl_fill		= io_pp_nl_fill,
	.uninstall		= io_pp_uninstall,
};

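/*
 * Post a zero-copy completion: a regular CQE carrying IORING_CQE_F_MORE plus
 * an extended io_uring_zcrx_cqe whose off field encodes the area token and
 * the byte offset of the received data within the area.
 */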
static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov,
			      struct io_zcrx_ifq *ifq, int off, int len)
{
	struct io_uring_zcrx_cqe *rcqe;
	struct io_zcrx_area *area;
	struct io_uring_cqe *cqe;
	u64 offset;

	if (!io_defer_get_uncommited_cqe(req->ctx, &cqe))
		return false;

	cqe->user_data = req->cqe.user_data;
	cqe->res = len;
	cqe->flags = IORING_CQE_F_MORE;

	area = io_zcrx_iov_to_area(niov);
	offset = off + (net_iov_idx(niov) << PAGE_SHIFT);
	rcqe = (struct io_uring_zcrx_cqe *)(cqe + 1);
	rcqe->off = offset + ((u64)area->area_id << IORING_ZCRX_AREA_SHIFT);
	rcqe->__pad = 0;
	return true;
}

static struct net_iov *io_zcrx_alloc_fallback(struct io_zcrx_area *area)
{
	struct net_iov *niov = NULL;

	spin_lock_bh(&area->freelist_lock);
	if (area->free_count)
		niov = __io_zcrx_get_free_niov(area);
	spin_unlock_bh(&area->freelist_lock);

	if (niov)
		page_pool_fragment_netmem(net_iov_to_netmem(niov), 1);
	return niov;
}

static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
				  void *src_base, struct page *src_page,
				  unsigned int src_offset, size_t len)
{
	struct io_zcrx_area *area = ifq->area;
	size_t copied = 0;
	int ret = 0;

	if (area->mem.is_dmabuf)
		return -EFAULT;

	while (len) {
		size_t copy_size = min_t(size_t, PAGE_SIZE, len);
		const int dst_off = 0;
		struct net_iov *niov;
		struct page *dst_page;
		void *dst_addr;

		niov = io_zcrx_alloc_fallback(area);
		if (!niov) {
			ret = -ENOMEM;
			break;
		}

		dst_page = io_zcrx_iov_page(niov);
		dst_addr = kmap_local_page(dst_page);
		if (src_page)
			src_base = kmap_local_page(src_page);

		memcpy(dst_addr, src_base + src_offset, copy_size);

		if (src_page)
			kunmap_local(src_base);
		kunmap_local(dst_addr);

		if (!io_zcrx_queue_cqe(req, niov, ifq, dst_off, copy_size)) {
			io_zcrx_return_niov(niov);
			ret = -ENOSPC;
			break;
		}

		io_zcrx_get_niov_uref(niov);
		src_offset += copy_size;
		len -= copy_size;
		copied += copy_size;
	}

	return copied ? copied : ret;
}

static int io_zcrx_copy_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
			     const skb_frag_t *frag, int off, int len)
{
	struct page *page = skb_frag_page(frag);
	u32 p_off, p_len, t, copied = 0;
	int ret = 0;

	off += skb_frag_off(frag);

	skb_frag_foreach_page(frag, off, len,
			      page, p_off, p_len, t) {
		ret = io_zcrx_copy_chunk(req, ifq, NULL, page, p_off, p_len);
		if (ret < 0)
			return copied ? copied : ret;
		copied += ret;
	}
	return copied;
}

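/*
 * Receive a single skb frag. Frags backed by this ifq's net_iovs are handed
 * to userspace by reference (a page pool ref plus a user ref pin the buffer);
 * anything else is bounced through the copy fallback.
 */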
static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
			     const skb_frag_t *frag, int off, int len)
{
	struct net_iov *niov;

	if (unlikely(!skb_frag_is_net_iov(frag)))
		return io_zcrx_copy_frag(req, ifq, frag, off, len);

	niov = netmem_to_net_iov(frag->netmem);
	if (!niov->pp || niov->pp->mp_ops != &io_uring_pp_zc_ops ||
	    io_pp_to_ifq(niov->pp) != ifq)
		return -EFAULT;

	if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len))
		return -ENOSPC;

	/*
	 * Prevent it from being recycled while user is accessing it.
	 * It has to be done before grabbing a user reference.
	 */
	page_pool_ref_netmem(net_iov_to_netmem(niov));
	io_zcrx_get_niov_uref(niov);
	return len;
}

static int
io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
		 unsigned int offset, size_t len)
{
	struct io_zcrx_args *args = desc->arg.data;
	struct io_zcrx_ifq *ifq = args->ifq;
	struct io_kiocb *req = args->req;
	struct sk_buff *frag_iter;
	unsigned start, start_off = offset;
	int i, copy, end, off;
	int ret = 0;

	len = min_t(size_t, len, desc->count);
	/*
	 * __tcp_read_sock() always calls io_zcrx_recv_skb one last time, even
	 * if desc->count is already 0. This is caused by the if (offset + 1 !=
	 * skb->len) check. Return early in this case to break out of
	 * __tcp_read_sock().
	 */
	if (!len)
		return 0;
	if (unlikely(args->nr_skbs++ > IO_SKBS_PER_CALL_LIMIT))
		return -EAGAIN;

	if (unlikely(offset < skb_headlen(skb))) {
		ssize_t copied;
		size_t to_copy;

		to_copy = min_t(size_t, skb_headlen(skb) - offset, len);
		copied = io_zcrx_copy_chunk(req, ifq, skb->data, NULL,
					    offset, to_copy);
		if (copied < 0) {
			ret = copied;
			goto out;
		}
		offset += copied;
		len -= copied;
		if (!len)
			goto out;
		if (offset != skb_headlen(skb))
			goto out;
	}

	start = skb_headlen(skb);

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		const skb_frag_t *frag;

		if (WARN_ON(start > offset + len))
			return -EFAULT;

		frag = &skb_shinfo(skb)->frags[i];
		end = start + skb_frag_size(frag);

		if (offset < end) {
			copy = end - offset;
			if (copy > len)
				copy = len;

			off = offset - start;
			ret = io_zcrx_recv_frag(req, ifq, frag, off, copy);
			if (ret < 0)
				goto out;

			offset += ret;
			len -= ret;
			if (len == 0 || ret != copy)
				goto out;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		if (WARN_ON(start > offset + len))
			return -EFAULT;

		end = start + frag_iter->len;
		if (offset < end) {
			copy = end - offset;
			if (copy > len)
				copy = len;

			off = offset - start;
			ret = io_zcrx_recv_skb(desc, frag_iter, off, copy);
			if (ret < 0)
				goto out;

			offset += ret;
			len -= ret;
			if (len == 0 || ret != copy)
				goto out;
		}
		start = end;
	}

out:
	if (offset == start_off)
		return ret;
	desc->count -= (offset - start_off);
	return offset - start_off;
}

static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
			       struct sock *sk, int flags,
			       unsigned issue_flags, unsigned int *outlen)
{
	unsigned int len = *outlen;
	struct io_zcrx_args args = {
		.req = req,
		.ifq = ifq,
		.sock = sk->sk_socket,
	};
	read_descriptor_t rd_desc = {
		.count = len ? len : UINT_MAX,
		.arg.data = &args,
	};
	int ret;

	lock_sock(sk);
	ret = tcp_read_sock(sk, &rd_desc, io_zcrx_recv_skb);
	if (len && ret > 0)
		*outlen = len - ret;
	if (ret <= 0) {
		if (ret < 0 || sock_flag(sk, SOCK_DONE))
			goto out;
		if (sk->sk_err)
			ret = sock_error(sk);
		else if (sk->sk_shutdown & RCV_SHUTDOWN)
			goto out;
		else if (sk->sk_state == TCP_CLOSE)
			ret = -ENOTCONN;
		else
			ret = -EAGAIN;
	} else if (unlikely(args.nr_skbs > IO_SKBS_PER_CALL_LIMIT) &&
		   (issue_flags & IO_URING_F_MULTISHOT)) {
		ret = IOU_REQUEUE;
	} else if (sock_flag(sk, SOCK_DONE)) {
		/* Make it to retry until it finally gets 0. */
		if (issue_flags & IO_URING_F_MULTISHOT)
			ret = IOU_REQUEUE;
		else
			ret = -EAGAIN;
	}
out:
	release_sock(sk);
	return ret;
}

int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
		 struct socket *sock, unsigned int flags,
		 unsigned issue_flags, unsigned int *len)
{
	struct sock *sk = sock->sk;
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot->recvmsg != tcp_recvmsg)
		return -EPROTONOSUPPORT;

	sock_rps_record_flow(sk);
	return io_zcrx_tcp_recvmsg(req, ifq, sk, flags, issue_flags, len);
}