// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/dma-map-ops.h>
#include <linux/mm.h>
#include <linux/nospec.h>
#include <linux/io_uring.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff_ref.h>

#include <net/page_pool/helpers.h>
#include <net/page_pool/memory_provider.h>
#include <net/netlink.h>
#include <net/netdev_rx_queue.h>
#include <net/tcp.h>
#include <net/rps.h>

#include <trace/events/page_pool.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "kbuf.h"
#include "memmap.h"
#include "zcrx.h"
#include "rsrc.h"

static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp)
{
	return pp->mp_priv;
}

#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)

static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
				 struct io_zcrx_area *area, int nr_mapped)
{
	int i;

	for (i = 0; i < nr_mapped; i++) {
		struct net_iov *niov = &area->nia.niovs[i];
		dma_addr_t dma;

		dma = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov));
		dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE,
				     DMA_FROM_DEVICE, IO_DMA_ATTR);
		net_mp_niov_set_dma_addr(niov, 0);
	}
}

static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
	guard(mutex)(&ifq->dma_lock);

	if (area->is_mapped)
		__io_zcrx_unmap_area(ifq, area, area->nia.num_niovs);
	area->is_mapped = false;
}

static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
	int i;

	guard(mutex)(&ifq->dma_lock);
	if (area->is_mapped)
		return 0;

	for (i = 0; i < area->nia.num_niovs; i++) {
		struct net_iov *niov = &area->nia.niovs[i];
		dma_addr_t dma;

		dma = dma_map_page_attrs(ifq->dev, area->pages[i], 0, PAGE_SIZE,
					 DMA_FROM_DEVICE, IO_DMA_ATTR);
		if (dma_mapping_error(ifq->dev, dma))
			break;
		if (net_mp_niov_set_dma_addr(niov, dma)) {
			dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE,
					     DMA_FROM_DEVICE, IO_DMA_ATTR);
			break;
		}
	}

	if (i != area->nia.num_niovs) {
		__io_zcrx_unmap_area(ifq, area, i);
		return -EINVAL;
	}

	area->is_mapped = true;
	return 0;
}

static void io_zcrx_sync_for_device(const struct page_pool *pool,
				    struct net_iov *niov)
{
#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
	dma_addr_t dma_addr;

	if (!dma_dev_need_sync(pool->p.dev))
		return;

	dma_addr = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov));
	__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
				     PAGE_SIZE, pool->p.dma_dir);
#endif
}

#define IO_RQ_MAX_ENTRIES		32768

#define IO_SKBS_PER_CALL_LIMIT	20

struct io_zcrx_args {
	struct io_kiocb		*req;
	struct io_zcrx_ifq	*ifq;
	struct socket		*sock;
	unsigned		nr_skbs;
};

static const struct memory_provider_ops io_uring_pp_zc_ops;

static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
{
	struct net_iov_area *owner = net_iov_owner(niov);

	return container_of(owner, struct io_zcrx_area, nia);
}

static inline atomic_t *io_get_user_counter(struct net_iov *niov)
{
	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);

	return &area->user_refs[net_iov_idx(niov)];
}

static bool io_zcrx_put_niov_uref(struct net_iov *niov)
{
	atomic_t *uref = io_get_user_counter(niov);

	if (unlikely(!atomic_read(uref)))
		return false;
	atomic_dec(uref);
	return true;
}

static void io_zcrx_get_niov_uref(struct net_iov *niov)
{
	atomic_inc(io_get_user_counter(niov));
}

static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
{
	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);

	return area->pages[net_iov_idx(niov)];
}

static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
				 struct io_uring_zcrx_ifq_reg *reg,
				 struct io_uring_region_desc *rd)
{
	size_t off, size;
	void *ptr;
	int ret;

	off = sizeof(struct io_uring);
	size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries;
	if (size > rd->size)
		return -EINVAL;

	ret = io_create_region_mmap_safe(ifq->ctx, &ifq->ctx->zcrx_region, rd,
					 IORING_MAP_OFF_ZCRX_REGION);
	if (ret < 0)
		return ret;

	ptr = io_region_get_ptr(&ifq->ctx->zcrx_region);
	ifq->rq_ring = (struct io_uring *)ptr;
	ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
	return 0;
}

static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
{
	io_free_region(ifq->ctx, &ifq->ctx->zcrx_region);
	ifq->rq_ring = NULL;
	ifq->rqes = NULL;
}

static void io_zcrx_free_area(struct io_zcrx_area *area)
{
	io_zcrx_unmap_area(area->ifq, area);

	kvfree(area->freelist);
	kvfree(area->nia.niovs);
	kvfree(area->user_refs);
	if (area->pages) {
		unpin_user_pages(area->pages, area->nr_folios);
		kvfree(area->pages);
	}
	kfree(area);
}

static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
			       struct io_zcrx_area **res,
			       struct io_uring_zcrx_area_reg *area_reg)
{
	struct io_zcrx_area *area;
	int i, ret, nr_pages, nr_iovs;
	struct iovec iov;

	if (area_reg->flags || area_reg->rq_area_token)
		return -EINVAL;
	if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1])
		return -EINVAL;
	if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
		return -EINVAL;

	iov.iov_base = u64_to_user_ptr(area_reg->addr);
	iov.iov_len = area_reg->len;
	ret = io_buffer_validate(&iov);
	if (ret)
		return ret;

	ret = -ENOMEM;
	area = kzalloc(sizeof(*area), GFP_KERNEL);
	if (!area)
		goto err;

	area->pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
				   &nr_pages);
	if (IS_ERR(area->pages)) {
		ret = PTR_ERR(area->pages);
		area->pages = NULL;
		goto err;
	}
	area->nr_folios = nr_iovs = nr_pages;
	area->nia.num_niovs = nr_iovs;

	area->nia.niovs = kvmalloc_array(nr_iovs, sizeof(area->nia.niovs[0]),
					 GFP_KERNEL | __GFP_ZERO);
	if (!area->nia.niovs)
		goto err;

	area->freelist = kvmalloc_array(nr_iovs, sizeof(area->freelist[0]),
					GFP_KERNEL | __GFP_ZERO);
	if (!area->freelist)
		goto err;

	for (i = 0; i < nr_iovs; i++)
		area->freelist[i] = i;

	area->user_refs = kvmalloc_array(nr_iovs, sizeof(area->user_refs[0]),
					 GFP_KERNEL | __GFP_ZERO);
	if (!area->user_refs)
		goto err;

	for (i = 0; i < nr_iovs; i++) {
		struct net_iov *niov = &area->nia.niovs[i];

		niov->owner = &area->nia;
		area->freelist[i] = i;
		atomic_set(&area->user_refs[i], 0);
	}

	area->free_count = nr_iovs;
	area->ifq = ifq;
	/* we're only supporting one area per ifq for now */
	area->area_id = 0;
	area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT;
	spin_lock_init(&area->freelist_lock);
	*res = area;
	return 0;
err:
	if (area)
		io_zcrx_free_area(area);
	return ret;
}

static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
{
	struct io_zcrx_ifq *ifq;

	ifq = kzalloc(sizeof(*ifq), GFP_KERNEL);
	if (!ifq)
		return NULL;

	ifq->if_rxq = -1;
	ifq->ctx = ctx;
	spin_lock_init(&ifq->lock);
	spin_lock_init(&ifq->rq_lock);
	mutex_init(&ifq->dma_lock);
	return ifq;
}

static void io_zcrx_drop_netdev(struct io_zcrx_ifq *ifq)
{
	spin_lock(&ifq->lock);
	if (ifq->netdev) {
		netdev_put(ifq->netdev, &ifq->netdev_tracker);
		ifq->netdev = NULL;
	}
	spin_unlock(&ifq->lock);
}

static void io_close_queue(struct io_zcrx_ifq *ifq)
{
	struct net_device *netdev;
	netdevice_tracker netdev_tracker;
	struct pp_memory_provider_params p = {
		.mp_ops = &io_uring_pp_zc_ops,
		.mp_priv = ifq,
	};

	if (ifq->if_rxq == -1)
		return;

	spin_lock(&ifq->lock);
	netdev = ifq->netdev;
	netdev_tracker = ifq->netdev_tracker;
	ifq->netdev = NULL;
	spin_unlock(&ifq->lock);

	if (netdev) {
		net_mp_close_rxq(netdev, ifq->if_rxq, &p);
		netdev_put(netdev, &netdev_tracker);
	}
	ifq->if_rxq = -1;
}

static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
{
	io_close_queue(ifq);
	io_zcrx_drop_netdev(ifq);

	if (ifq->area)
		io_zcrx_free_area(ifq->area);
	if (ifq->dev)
		put_device(ifq->dev);

	io_free_rbuf_ring(ifq);
	mutex_destroy(&ifq->dma_lock);
	kfree(ifq);
}

int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
			 struct io_uring_zcrx_ifq_reg __user *arg)
{
	struct pp_memory_provider_params mp_param = {};
	struct io_uring_zcrx_area_reg area;
	struct io_uring_zcrx_ifq_reg reg;
	struct io_uring_region_desc rd;
	struct io_zcrx_ifq *ifq;
	int ret;

	/*
	 * 1. Interface queue allocation.
	 * 2. It can observe data destined for sockets of other tasks.
	 */
	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	/* mandatory io_uring features for zc rx */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN &&
	      ctx->flags & IORING_SETUP_CQE32))
		return -EINVAL;
	if (ctx->ifq)
		return -EBUSY;
	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)) ||
	    reg.__resv2 || reg.zcrx_id)
		return -EINVAL;
	if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
		return -EINVAL;
	if (reg.rq_entries > IO_RQ_MAX_ENTRIES) {
		if (!(ctx->flags & IORING_SETUP_CLAMP))
			return -EINVAL;
		reg.rq_entries = IO_RQ_MAX_ENTRIES;
	}
	reg.rq_entries = roundup_pow_of_two(reg.rq_entries);

	if (copy_from_user(&area, u64_to_user_ptr(reg.area_ptr), sizeof(area)))
		return -EFAULT;

	ifq = io_zcrx_ifq_alloc(ctx);
	if (!ifq)
		return -ENOMEM;

	ret = io_allocate_rbuf_ring(ifq, &reg, &rd);
	if (ret)
		goto err;

	ret = io_zcrx_create_area(ifq, &ifq->area, &area);
	if (ret)
		goto err;

	ifq->rq_entries = reg.rq_entries;

	ret = -ENODEV;
	ifq->netdev = netdev_get_by_index(current->nsproxy->net_ns, reg.if_idx,
					  &ifq->netdev_tracker, GFP_KERNEL);
	if (!ifq->netdev)
		goto err;

	ifq->dev = ifq->netdev->dev.parent;
	ret = -EOPNOTSUPP;
	if (!ifq->dev)
		goto err;
	get_device(ifq->dev);

	mp_param.mp_ops = &io_uring_pp_zc_ops;
	mp_param.mp_priv = ifq;
	ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param);
	if (ret)
		goto err;
	ifq->if_rxq = reg.if_rxq;

	reg.offsets.rqes = sizeof(struct io_uring);
	reg.offsets.head = offsetof(struct io_uring, head);
	reg.offsets.tail = offsetof(struct io_uring, tail);

	if (copy_to_user(arg, &reg, sizeof(reg)) ||
	    copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) ||
	    copy_to_user(u64_to_user_ptr(reg.area_ptr), &area, sizeof(area))) {
		ret = -EFAULT;
		goto err;
	}
	ctx->ifq = ifq;
	return 0;
err:
	io_zcrx_ifq_free(ifq);
	return ret;
}

void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
{
	struct io_zcrx_ifq *ifq = ctx->ifq;

	lockdep_assert_held(&ctx->uring_lock);

	if (!ifq)
		return;

	ctx->ifq = NULL;
	io_zcrx_ifq_free(ifq);
}

static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
{
	unsigned niov_idx;

	lockdep_assert_held(&area->freelist_lock);

	niov_idx = area->freelist[--area->free_count];
	return &area->nia.niovs[niov_idx];
}

static void io_zcrx_return_niov_freelist(struct net_iov *niov)
{
	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);

	spin_lock_bh(&area->freelist_lock);
	area->freelist[area->free_count++] = net_iov_idx(niov);
	spin_unlock_bh(&area->freelist_lock);
}

static void io_zcrx_return_niov(struct net_iov *niov)
{
	netmem_ref netmem = net_iov_to_netmem(niov);

	if (!niov->pp) {
		/* copy fallback allocated niovs */
		io_zcrx_return_niov_freelist(niov);
		return;
	}
	page_pool_put_unrefed_netmem(niov->pp, netmem, -1, false);
}

static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
{
	struct io_zcrx_area *area = ifq->area;
	int i;

	if (!area)
		return;

	/* Reclaim back all buffers given to the user space. */
	for (i = 0; i < area->nia.num_niovs; i++) {
		struct net_iov *niov = &area->nia.niovs[i];
		int nr;

		if (!atomic_read(io_get_user_counter(niov)))
			continue;
		nr = atomic_xchg(io_get_user_counter(niov), 0);
		if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
			io_zcrx_return_niov(niov);
	}
}

void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
{
	lockdep_assert_held(&ctx->uring_lock);

	if (!ctx->ifq)
		return;
	io_zcrx_scrub(ctx->ifq);
	io_close_queue(ctx->ifq);
}

static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq)
{
	u32 entries;

	entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head;
	return min(entries, ifq->rq_entries);
}

static struct io_uring_zcrx_rqe *io_zcrx_get_rqe(struct io_zcrx_ifq *ifq,
						 unsigned mask)
{
	unsigned int idx = ifq->cached_rq_head++ & mask;

	return &ifq->rqes[idx];
}

static void io_zcrx_ring_refill(struct page_pool *pp,
				struct io_zcrx_ifq *ifq)
{
	unsigned int mask = ifq->rq_entries - 1;
	unsigned int entries;
	netmem_ref netmem;

	spin_lock_bh(&ifq->rq_lock);

	entries = io_zcrx_rqring_entries(ifq);
	entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL - pp->alloc.count);
	if (unlikely(!entries)) {
		spin_unlock_bh(&ifq->rq_lock);
		return;
	}

	do {
		struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask);
		struct io_zcrx_area *area;
		struct net_iov *niov;
		unsigned niov_idx, area_idx;

		area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT;
		niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) >> PAGE_SHIFT;

		if (unlikely(rqe->__pad || area_idx))
			continue;
		area = ifq->area;

		if (unlikely(niov_idx >= area->nia.num_niovs))
			continue;
		niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs);

		niov = &area->nia.niovs[niov_idx];
		if (!io_zcrx_put_niov_uref(niov))
			continue;

		netmem = net_iov_to_netmem(niov);
		if (page_pool_unref_netmem(netmem, 1) != 0)
			continue;

		if (unlikely(niov->pp != pp)) {
			io_zcrx_return_niov(niov);
			continue;
		}

		io_zcrx_sync_for_device(pp, niov);
		net_mp_netmem_place_in_cache(pp, netmem);
	} while (--entries);

	smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head);
	spin_unlock_bh(&ifq->rq_lock);
}

static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq)
{
	struct io_zcrx_area *area = ifq->area;

	spin_lock_bh(&area->freelist_lock);
	while (area->free_count && pp->alloc.count < PP_ALLOC_CACHE_REFILL) {
		struct net_iov *niov = __io_zcrx_get_free_niov(area);
		netmem_ref netmem = net_iov_to_netmem(niov);

		net_mp_niov_set_page_pool(pp, niov);
		io_zcrx_sync_for_device(pp, niov);
		net_mp_netmem_place_in_cache(pp, netmem);
	}
	spin_unlock_bh(&area->freelist_lock);
}

static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
{
	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);

	/* pp should already be ensuring that */
	if (unlikely(pp->alloc.count))
		goto out_return;

	io_zcrx_ring_refill(pp, ifq);
	if (likely(pp->alloc.count))
		goto out_return;

	io_zcrx_refill_slow(pp, ifq);
	if (!pp->alloc.count)
		return 0;
out_return:
	return pp->alloc.cache[--pp->alloc.count];
}

static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)
{
	struct net_iov *niov;

	if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
		return false;

	niov = netmem_to_net_iov(netmem);
	net_mp_niov_clear_page_pool(niov);
	io_zcrx_return_niov_freelist(niov);
	return false;
}

static int io_pp_zc_init(struct page_pool *pp)
{
	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
	int ret;

	if (WARN_ON_ONCE(!ifq))
		return -EINVAL;
	if (WARN_ON_ONCE(ifq->dev != pp->p.dev))
		return -EINVAL;
	if (WARN_ON_ONCE(!pp->dma_map))
		return -EOPNOTSUPP;
	if (pp->p.order != 0)
		return -EOPNOTSUPP;
	if (pp->p.dma_dir != DMA_FROM_DEVICE)
		return -EOPNOTSUPP;

	ret = io_zcrx_map_area(ifq, ifq->area);
	if (ret)
		return ret;

	percpu_ref_get(&ifq->ctx->refs);
	return 0;
}

static void io_pp_zc_destroy(struct page_pool *pp)
{
	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
	struct io_zcrx_area *area = ifq->area;

	if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs))
		return;
	percpu_ref_put(&ifq->ctx->refs);
}

static int io_pp_nl_fill(void *mp_priv, struct sk_buff *rsp,
			 struct netdev_rx_queue *rxq)
{
	struct nlattr *nest;
	int type;

	type = rxq ? NETDEV_A_QUEUE_IO_URING : NETDEV_A_PAGE_POOL_IO_URING;
	nest = nla_nest_start(rsp, type);
	if (!nest)
		return -EMSGSIZE;
	nla_nest_end(rsp, nest);

	return 0;
}

static void io_pp_uninstall(void *mp_priv, struct netdev_rx_queue *rxq)
{
	struct pp_memory_provider_params *p = &rxq->mp_params;
	struct io_zcrx_ifq *ifq = mp_priv;

	io_zcrx_drop_netdev(ifq);
	if (ifq->area)
		io_zcrx_unmap_area(ifq, ifq->area);

	p->mp_ops = NULL;
	p->mp_priv = NULL;
}

static const struct memory_provider_ops io_uring_pp_zc_ops = {
	.alloc_netmems		= io_pp_zc_alloc_netmems,
	.release_netmem		= io_pp_zc_release_netmem,
	.init			= io_pp_zc_init,
	.destroy		= io_pp_zc_destroy,
	.nl_fill		= io_pp_nl_fill,
	.uninstall		= io_pp_uninstall,
};

static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov,
			      struct io_zcrx_ifq *ifq, int off, int len)
{
	struct io_uring_zcrx_cqe *rcqe;
	struct io_zcrx_area *area;
	struct io_uring_cqe *cqe;
	u64 offset;

	if (!io_defer_get_uncommited_cqe(req->ctx, &cqe))
		return false;

	cqe->user_data = req->cqe.user_data;
	cqe->res = len;
	cqe->flags = IORING_CQE_F_MORE;

	area = io_zcrx_iov_to_area(niov);
	offset = off + (net_iov_idx(niov) << PAGE_SHIFT);
	rcqe = (struct io_uring_zcrx_cqe *)(cqe + 1);
	rcqe->off = offset + ((u64)area->area_id << IORING_ZCRX_AREA_SHIFT);
	rcqe->__pad = 0;
	return true;
}

static struct net_iov *io_zcrx_alloc_fallback(struct io_zcrx_area *area)
{
	struct net_iov *niov = NULL;

	spin_lock_bh(&area->freelist_lock);
	if (area->free_count)
		niov = __io_zcrx_get_free_niov(area);
	spin_unlock_bh(&area->freelist_lock);

	if (niov)
		page_pool_fragment_netmem(net_iov_to_netmem(niov), 1);
	return niov;
}

static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
				  void *src_base, struct page *src_page,
				  unsigned int src_offset, size_t len)
{
	struct io_zcrx_area *area = ifq->area;
	size_t copied = 0;
	int ret = 0;

	while (len) {
		size_t copy_size = min_t(size_t, PAGE_SIZE, len);
		const int dst_off = 0;
		struct net_iov *niov;
		struct page *dst_page;
		void *dst_addr;

		niov = io_zcrx_alloc_fallback(area);
		if (!niov) {
			ret = -ENOMEM;
			break;
		}

		dst_page = io_zcrx_iov_page(niov);
		dst_addr = kmap_local_page(dst_page);
		if (src_page)
			src_base = kmap_local_page(src_page);

		memcpy(dst_addr, src_base + src_offset, copy_size);

		if (src_page)
			kunmap_local(src_base);
		kunmap_local(dst_addr);

		if (!io_zcrx_queue_cqe(req, niov, ifq, dst_off, copy_size)) {
			io_zcrx_return_niov(niov);
			ret = -ENOSPC;
			break;
		}

		io_zcrx_get_niov_uref(niov);
		src_offset += copy_size;
		len -= copy_size;
		copied += copy_size;
	}

	return copied ? copied : ret;
}

static int io_zcrx_copy_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
			     const skb_frag_t *frag, int off, int len)
{
	struct page *page = skb_frag_page(frag);
	u32 p_off, p_len, t, copied = 0;
	int ret = 0;

	off += skb_frag_off(frag);

	skb_frag_foreach_page(frag, off, len,
			      page, p_off, p_len, t) {
		ret = io_zcrx_copy_chunk(req, ifq, NULL, page, p_off, p_len);
		if (ret < 0)
			return copied ? copied : ret;
		copied += ret;
	}
	return copied;
}

static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
			     const skb_frag_t *frag, int off, int len)
{
	struct net_iov *niov;

	if (unlikely(!skb_frag_is_net_iov(frag)))
		return io_zcrx_copy_frag(req, ifq, frag, off, len);

	niov = netmem_to_net_iov(frag->netmem);
	if (niov->pp->mp_ops != &io_uring_pp_zc_ops ||
	    io_pp_to_ifq(niov->pp) != ifq)
		return -EFAULT;

	if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len))
		return -ENOSPC;

	/*
	 * Prevent it from being recycled while user is accessing it.
	 * It has to be done before grabbing a user reference.
	 */
	page_pool_ref_netmem(net_iov_to_netmem(niov));
	io_zcrx_get_niov_uref(niov);
	return len;
}

static int
io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
		 unsigned int offset, size_t len)
{
	struct io_zcrx_args *args = desc->arg.data;
	struct io_zcrx_ifq *ifq = args->ifq;
	struct io_kiocb *req = args->req;
	struct sk_buff *frag_iter;
	unsigned start, start_off = offset;
	int i, copy, end, off;
	int ret = 0;

	len = min_t(size_t, len, desc->count);
	/*
	 * __tcp_read_sock() always calls io_zcrx_recv_skb one last time, even
	 * if desc->count is already 0. This is caused by the if (offset + 1 !=
	 * skb->len) check. Return early in this case to break out of
	 * __tcp_read_sock().
	 */
	if (!len)
		return 0;
	if (unlikely(args->nr_skbs++ > IO_SKBS_PER_CALL_LIMIT))
		return -EAGAIN;

	if (unlikely(offset < skb_headlen(skb))) {
		ssize_t copied;
		size_t to_copy;

		to_copy = min_t(size_t, skb_headlen(skb) - offset, len);
		copied = io_zcrx_copy_chunk(req, ifq, skb->data, NULL,
					    offset, to_copy);
		if (copied < 0) {
			ret = copied;
			goto out;
		}
		offset += copied;
		len -= copied;
		if (!len)
			goto out;
		if (offset != skb_headlen(skb))
			goto out;
	}

	start = skb_headlen(skb);

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		const skb_frag_t *frag;

		if (WARN_ON(start > offset + len))
			return -EFAULT;

		frag = &skb_shinfo(skb)->frags[i];
		end = start + skb_frag_size(frag);

		if (offset < end) {
			copy = end - offset;
			if (copy > len)
				copy = len;

			off = offset - start;
			ret = io_zcrx_recv_frag(req, ifq, frag, off, copy);
			if (ret < 0)
				goto out;

			offset += ret;
			len -= ret;
			if (len == 0 || ret != copy)
				goto out;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		if (WARN_ON(start > offset + len))
			return -EFAULT;

		end = start + frag_iter->len;
		if (offset < end) {
			copy = end - offset;
			if (copy > len)
				copy = len;

			off = offset - start;
			ret = io_zcrx_recv_skb(desc, frag_iter, off, copy);
			if (ret < 0)
				goto out;

			offset += ret;
			len -= ret;
			if (len == 0 || ret != copy)
				goto out;
		}
		start = end;
	}

out:
	if (offset == start_off)
		return ret;
	desc->count -= (offset - start_off);
	return offset - start_off;
}

static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
			       struct sock *sk, int flags,
			       unsigned issue_flags, unsigned int *outlen)
{
	unsigned int len = *outlen;
	struct io_zcrx_args args = {
		.req = req,
		.ifq = ifq,
		.sock = sk->sk_socket,
	};
	read_descriptor_t rd_desc = {
		.count = len ? len : UINT_MAX,
		.arg.data = &args,
	};
	int ret;

	lock_sock(sk);
	ret = tcp_read_sock(sk, &rd_desc, io_zcrx_recv_skb);
	if (len && ret > 0)
		*outlen = len - ret;
	if (ret <= 0) {
		if (ret < 0 || sock_flag(sk, SOCK_DONE))
			goto out;
		if (sk->sk_err)
			ret = sock_error(sk);
		else if (sk->sk_shutdown & RCV_SHUTDOWN)
			goto out;
		else if (sk->sk_state == TCP_CLOSE)
			ret = -ENOTCONN;
		else
			ret = -EAGAIN;
	} else if (unlikely(args.nr_skbs > IO_SKBS_PER_CALL_LIMIT) &&
		   (issue_flags & IO_URING_F_MULTISHOT)) {
		ret = IOU_REQUEUE;
	} else if (sock_flag(sk, SOCK_DONE)) {
		/* Make it to retry until it finally gets 0. */
		if (issue_flags & IO_URING_F_MULTISHOT)
			ret = IOU_REQUEUE;
		else
			ret = -EAGAIN;
	}
out:
	release_sock(sk);
	return ret;
}

int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
		 struct socket *sock, unsigned int flags,
		 unsigned issue_flags, unsigned int *len)
{
	struct sock *sk = sock->sk;
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot->recvmsg != tcp_recvmsg)
		return -EPROTONOSUPPORT;

	sock_rps_record_flow(sk);
	return io_zcrx_tcp_recvmsg(req, ifq, sk, flags, issue_flags, len);
}