// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/dma-map-ops.h>
#include <linux/mm.h>
#include <linux/nospec.h>
#include <linux/io_uring.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff_ref.h>

#include <net/page_pool/helpers.h>
#include <net/page_pool/memory_provider.h>
#include <net/netlink.h>
#include <net/netdev_rx_queue.h>
#include <net/tcp.h>
#include <net/rps.h>

#include <trace/events/page_pool.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "kbuf.h"
#include "memmap.h"
#include "zcrx.h"
#include "rsrc.h"

static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp)
{
	return pp->mp_priv;
}

#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)

static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
				 struct io_zcrx_area *area, int nr_mapped)
{
	int i;

	for (i = 0; i < nr_mapped; i++) {
		struct net_iov *niov = &area->nia.niovs[i];
		dma_addr_t dma;

		dma = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov));
		dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE,
				     DMA_FROM_DEVICE, IO_DMA_ATTR);
		net_mp_niov_set_dma_addr(niov, 0);
	}
}

static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
	guard(mutex)(&ifq->dma_lock);

	if (area->is_mapped)
		__io_zcrx_unmap_area(ifq, area, area->nia.num_niovs);
	area->is_mapped = false;
}

static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
	int i;

	guard(mutex)(&ifq->dma_lock);
	if (area->is_mapped)
		return 0;

	for (i = 0; i < area->nia.num_niovs; i++) {
		struct net_iov *niov = &area->nia.niovs[i];
		dma_addr_t dma;

		dma = dma_map_page_attrs(ifq->dev, area->pages[i], 0, PAGE_SIZE,
					 DMA_FROM_DEVICE, IO_DMA_ATTR);
		if (dma_mapping_error(ifq->dev, dma))
			break;
		if (net_mp_niov_set_dma_addr(niov, dma)) {
			dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE,
					     DMA_FROM_DEVICE, IO_DMA_ATTR);
			break;
		}
	}

	if (i != area->nia.num_niovs) {
		__io_zcrx_unmap_area(ifq, area, i);
		return -EINVAL;
	}

	area->is_mapped = true;
	return 0;
}

static void io_zcrx_sync_for_device(const struct page_pool *pool,
				    struct net_iov *niov)
{
#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
	dma_addr_t dma_addr;

	if (!dma_dev_need_sync(pool->p.dev))
		return;

	dma_addr = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov));
	__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
				     PAGE_SIZE, pool->p.dma_dir);
#endif
}

#define IO_RQ_MAX_ENTRIES		32768

#define IO_SKBS_PER_CALL_LIMIT	20

struct io_zcrx_args {
	struct io_kiocb *req;
	struct io_zcrx_ifq *ifq;
	struct socket *sock;
	unsigned nr_skbs;
};

static const struct memory_provider_ops io_uring_pp_zc_ops;

static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
{
	struct net_iov_area *owner = net_iov_owner(niov);

	return container_of(owner, struct io_zcrx_area, nia);
}

static inline atomic_t *io_get_user_counter(struct net_iov *niov)
{
	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);

	return &area->user_refs[net_iov_idx(niov)];
}
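/*
 * Each niov has a matching user refcount in area->user_refs. It counts how
 * many times the buffer has been handed out to user space via zcrx CQEs;
 * buffers come back through the refill ring (io_zcrx_ring_refill()) or are
 * force-reclaimed by io_zcrx_scrub() on shutdown.
 */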
static bool io_zcrx_put_niov_uref(struct net_iov *niov)
{
	atomic_t *uref = io_get_user_counter(niov);

	if (unlikely(!atomic_read(uref)))
		return false;
	atomic_dec(uref);
	return true;
}

static void io_zcrx_get_niov_uref(struct net_iov *niov)
{
	atomic_inc(io_get_user_counter(niov));
}

static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
{
	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);

	return area->pages[net_iov_idx(niov)];
}

static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
				 struct io_uring_zcrx_ifq_reg *reg,
				 struct io_uring_region_desc *rd)
{
	size_t off, size;
	void *ptr;
	int ret;

	off = sizeof(struct io_uring);
	size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries;
	if (size > rd->size)
		return -EINVAL;

	ret = io_create_region_mmap_safe(ifq->ctx, &ifq->ctx->zcrx_region, rd,
					 IORING_MAP_OFF_ZCRX_REGION);
	if (ret < 0)
		return ret;

	ptr = io_region_get_ptr(&ifq->ctx->zcrx_region);
	ifq->rq_ring = (struct io_uring *)ptr;
	ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
	return 0;
}

static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
{
	io_free_region(ifq->ctx, &ifq->ctx->zcrx_region);
	ifq->rq_ring = NULL;
	ifq->rqes = NULL;
}

static void io_zcrx_free_area(struct io_zcrx_area *area)
{
	io_zcrx_unmap_area(area->ifq, area);

	kvfree(area->freelist);
	kvfree(area->nia.niovs);
	kvfree(area->user_refs);
	if (area->pages) {
		unpin_user_pages(area->pages, area->nr_folios);
		kvfree(area->pages);
	}
	kfree(area);
}
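/*
 * An area is a single user-provided buffer region: its pages are pinned and
 * each page is wrapped in a net_iov with a freelist slot and a user refcount.
 * Only one area per ifq is supported for now, hence area_id is always 0.
 */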
static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
			       struct io_zcrx_area **res,
			       struct io_uring_zcrx_area_reg *area_reg)
{
	struct io_zcrx_area *area;
	int i, ret, nr_pages, nr_iovs;
	struct iovec iov;

	if (area_reg->flags || area_reg->rq_area_token)
		return -EINVAL;
	if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1])
		return -EINVAL;
	if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
		return -EINVAL;

	iov.iov_base = u64_to_user_ptr(area_reg->addr);
	iov.iov_len = area_reg->len;
	ret = io_buffer_validate(&iov);
	if (ret)
		return ret;

	ret = -ENOMEM;
	area = kzalloc(sizeof(*area), GFP_KERNEL);
	if (!area)
		goto err;

	area->pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
				   &nr_pages);
	if (IS_ERR(area->pages)) {
		ret = PTR_ERR(area->pages);
		area->pages = NULL;
		goto err;
	}
	area->nr_folios = nr_iovs = nr_pages;
	area->nia.num_niovs = nr_iovs;

	area->nia.niovs = kvmalloc_array(nr_iovs, sizeof(area->nia.niovs[0]),
					 GFP_KERNEL | __GFP_ZERO);
	if (!area->nia.niovs)
		goto err;

	area->freelist = kvmalloc_array(nr_iovs, sizeof(area->freelist[0]),
					GFP_KERNEL | __GFP_ZERO);
	if (!area->freelist)
		goto err;

	for (i = 0; i < nr_iovs; i++)
		area->freelist[i] = i;

	area->user_refs = kvmalloc_array(nr_iovs, sizeof(area->user_refs[0]),
					 GFP_KERNEL | __GFP_ZERO);
	if (!area->user_refs)
		goto err;

	for (i = 0; i < nr_iovs; i++) {
		struct net_iov *niov = &area->nia.niovs[i];

		niov->owner = &area->nia;
		area->freelist[i] = i;
		atomic_set(&area->user_refs[i], 0);
		niov->type = NET_IOV_IOURING;
	}

	area->free_count = nr_iovs;
	area->ifq = ifq;
	/* we're only supporting one area per ifq for now */
	area->area_id = 0;
	area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT;
	spin_lock_init(&area->freelist_lock);
	*res = area;
	return 0;
err:
	if (area)
		io_zcrx_free_area(area);
	return ret;
}

static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
{
	struct io_zcrx_ifq *ifq;

	ifq = kzalloc(sizeof(*ifq), GFP_KERNEL);
	if (!ifq)
		return NULL;

	ifq->if_rxq = -1;
	ifq->ctx = ctx;
	spin_lock_init(&ifq->lock);
	spin_lock_init(&ifq->rq_lock);
	mutex_init(&ifq->dma_lock);
	return ifq;
}

static void io_zcrx_drop_netdev(struct io_zcrx_ifq *ifq)
{
	spin_lock(&ifq->lock);
	if (ifq->netdev) {
		netdev_put(ifq->netdev, &ifq->netdev_tracker);
		ifq->netdev = NULL;
	}
	spin_unlock(&ifq->lock);
}

static void io_close_queue(struct io_zcrx_ifq *ifq)
{
	struct net_device *netdev;
	netdevice_tracker netdev_tracker;
	struct pp_memory_provider_params p = {
		.mp_ops = &io_uring_pp_zc_ops,
		.mp_priv = ifq,
	};

	if (ifq->if_rxq == -1)
		return;

	spin_lock(&ifq->lock);
	netdev = ifq->netdev;
	netdev_tracker = ifq->netdev_tracker;
	ifq->netdev = NULL;
	spin_unlock(&ifq->lock);

	if (netdev) {
		net_mp_close_rxq(netdev, ifq->if_rxq, &p);
		netdev_put(netdev, &netdev_tracker);
	}
	ifq->if_rxq = -1;
}

static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
{
	io_close_queue(ifq);
	io_zcrx_drop_netdev(ifq);

	if (ifq->area)
		io_zcrx_free_area(ifq->area);
	if (ifq->dev)
		put_device(ifq->dev);

	io_free_rbuf_ring(ifq);
	mutex_destroy(&ifq->dma_lock);
	kfree(ifq);
}
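/*
 * Registration path: validate the user-supplied registration structs, set up
 * the refill ring region and the buffer area, then bind this ifq as the page
 * pool memory provider of the requested RX queue.
 */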
int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
			 struct io_uring_zcrx_ifq_reg __user *arg)
{
	struct pp_memory_provider_params mp_param = {};
	struct io_uring_zcrx_area_reg area;
	struct io_uring_zcrx_ifq_reg reg;
	struct io_uring_region_desc rd;
	struct io_zcrx_ifq *ifq;
	int ret;

	/*
	 * 1. Interface queue allocation.
	 * 2. It can observe data destined for sockets of other tasks.
	 */
	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	/* mandatory io_uring features for zc rx */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN &&
	      ctx->flags & IORING_SETUP_CQE32))
		return -EINVAL;
	if (ctx->ifq)
		return -EBUSY;
	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)) ||
	    reg.__resv2 || reg.zcrx_id)
		return -EINVAL;
	if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
		return -EINVAL;
	if (reg.rq_entries > IO_RQ_MAX_ENTRIES) {
		if (!(ctx->flags & IORING_SETUP_CLAMP))
			return -EINVAL;
		reg.rq_entries = IO_RQ_MAX_ENTRIES;
	}
	reg.rq_entries = roundup_pow_of_two(reg.rq_entries);

	if (copy_from_user(&area, u64_to_user_ptr(reg.area_ptr), sizeof(area)))
		return -EFAULT;

	ifq = io_zcrx_ifq_alloc(ctx);
	if (!ifq)
		return -ENOMEM;

	ret = io_allocate_rbuf_ring(ifq, &reg, &rd);
	if (ret)
		goto err;

	ret = io_zcrx_create_area(ifq, &ifq->area, &area);
	if (ret)
		goto err;

	ifq->rq_entries = reg.rq_entries;

	ret = -ENODEV;
	ifq->netdev = netdev_get_by_index(current->nsproxy->net_ns, reg.if_idx,
					  &ifq->netdev_tracker, GFP_KERNEL);
	if (!ifq->netdev)
		goto err;

	ifq->dev = ifq->netdev->dev.parent;
	ret = -EOPNOTSUPP;
	if (!ifq->dev)
		goto err;
	get_device(ifq->dev);

	mp_param.mp_ops = &io_uring_pp_zc_ops;
	mp_param.mp_priv = ifq;
	ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param);
	if (ret)
		goto err;
	ifq->if_rxq = reg.if_rxq;

	reg.offsets.rqes = sizeof(struct io_uring);
	reg.offsets.head = offsetof(struct io_uring, head);
	reg.offsets.tail = offsetof(struct io_uring, tail);

	if (copy_to_user(arg, &reg, sizeof(reg)) ||
	    copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) ||
	    copy_to_user(u64_to_user_ptr(reg.area_ptr), &area, sizeof(area))) {
		ret = -EFAULT;
		goto err;
	}
	ctx->ifq = ifq;
	return 0;
err:
	io_zcrx_ifq_free(ifq);
	return ret;
}

void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
{
	struct io_zcrx_ifq *ifq = ctx->ifq;

	lockdep_assert_held(&ctx->uring_lock);

	if (!ifq)
		return;

	ctx->ifq = NULL;
	io_zcrx_ifq_free(ifq);
}

static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
{
	unsigned niov_idx;

	lockdep_assert_held(&area->freelist_lock);

	niov_idx = area->freelist[--area->free_count];
	return &area->nia.niovs[niov_idx];
}

static void io_zcrx_return_niov_freelist(struct net_iov *niov)
{
	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);

	spin_lock_bh(&area->freelist_lock);
	area->freelist[area->free_count++] = net_iov_idx(niov);
	spin_unlock_bh(&area->freelist_lock);
}

static void io_zcrx_return_niov(struct net_iov *niov)
{
	netmem_ref netmem = net_iov_to_netmem(niov);

	if (!niov->pp) {
		/* copy fallback allocated niovs */
		io_zcrx_return_niov_freelist(niov);
		return;
	}
	page_pool_put_unrefed_netmem(niov->pp, netmem, -1, false);
}

static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
{
	struct io_zcrx_area *area = ifq->area;
	int i;

	if (!area)
		return;

	/* Reclaim back all buffers given to the user space. */
	for (i = 0; i < area->nia.num_niovs; i++) {
		struct net_iov *niov = &area->nia.niovs[i];
		int nr;

		if (!atomic_read(io_get_user_counter(niov)))
			continue;
		nr = atomic_xchg(io_get_user_counter(niov), 0);
		if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
			io_zcrx_return_niov(niov);
	}
}

void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
{
	lockdep_assert_held(&ctx->uring_lock);

	if (!ctx->ifq)
		return;
	io_zcrx_scrub(ctx->ifq);
	io_close_queue(ctx->ifq);
}
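/*
 * Refill ring: user space returns buffers by writing io_uring_zcrx_rqe
 * entries and advancing the tail; the kernel consumes them under rq_lock,
 * validates the encoded area/niov offsets and publishes the new head with a
 * release store.
 */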
static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq)
{
	u32 entries;

	entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head;
	return min(entries, ifq->rq_entries);
}

static struct io_uring_zcrx_rqe *io_zcrx_get_rqe(struct io_zcrx_ifq *ifq,
						 unsigned mask)
{
	unsigned int idx = ifq->cached_rq_head++ & mask;

	return &ifq->rqes[idx];
}

static void io_zcrx_ring_refill(struct page_pool *pp,
				struct io_zcrx_ifq *ifq)
{
	unsigned int mask = ifq->rq_entries - 1;
	unsigned int entries;
	netmem_ref netmem;

	spin_lock_bh(&ifq->rq_lock);

	entries = io_zcrx_rqring_entries(ifq);
	entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL - pp->alloc.count);
	if (unlikely(!entries)) {
		spin_unlock_bh(&ifq->rq_lock);
		return;
	}

	do {
		struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask);
		struct io_zcrx_area *area;
		struct net_iov *niov;
		unsigned niov_idx, area_idx;

		area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT;
		niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) >> PAGE_SHIFT;

		if (unlikely(rqe->__pad || area_idx))
			continue;
		area = ifq->area;

		if (unlikely(niov_idx >= area->nia.num_niovs))
			continue;
		niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs);

		niov = &area->nia.niovs[niov_idx];
		if (!io_zcrx_put_niov_uref(niov))
			continue;

		netmem = net_iov_to_netmem(niov);
		if (page_pool_unref_netmem(netmem, 1) != 0)
			continue;

		if (unlikely(niov->pp != pp)) {
			io_zcrx_return_niov(niov);
			continue;
		}

		io_zcrx_sync_for_device(pp, niov);
		net_mp_netmem_place_in_cache(pp, netmem);
	} while (--entries);

	smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head);
	spin_unlock_bh(&ifq->rq_lock);
}

static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq)
{
	struct io_zcrx_area *area = ifq->area;

	spin_lock_bh(&area->freelist_lock);
	while (area->free_count && pp->alloc.count < PP_ALLOC_CACHE_REFILL) {
		struct net_iov *niov = __io_zcrx_get_free_niov(area);
		netmem_ref netmem = net_iov_to_netmem(niov);

		net_mp_niov_set_page_pool(pp, niov);
		io_zcrx_sync_for_device(pp, niov);
		net_mp_netmem_place_in_cache(pp, netmem);
	}
	spin_unlock_bh(&area->freelist_lock);
}
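/*
 * Page pool memory provider callbacks: allocations are served from the
 * refill ring first and from the area freelist as a slow path, while
 * netmems released by the pool are parked back on the freelist.
 */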
static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
{
	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);

	/* pp should already be ensuring that */
	if (unlikely(pp->alloc.count))
		goto out_return;

	io_zcrx_ring_refill(pp, ifq);
	if (likely(pp->alloc.count))
		goto out_return;

	io_zcrx_refill_slow(pp, ifq);
	if (!pp->alloc.count)
		return 0;
out_return:
	return pp->alloc.cache[--pp->alloc.count];
}

static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)
{
	struct net_iov *niov;

	if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
		return false;

	niov = netmem_to_net_iov(netmem);
	net_mp_niov_clear_page_pool(niov);
	io_zcrx_return_niov_freelist(niov);
	return false;
}

static int io_pp_zc_init(struct page_pool *pp)
{
	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
	int ret;

	if (WARN_ON_ONCE(!ifq))
		return -EINVAL;
	if (WARN_ON_ONCE(ifq->dev != pp->p.dev))
		return -EINVAL;
	if (WARN_ON_ONCE(!pp->dma_map))
		return -EOPNOTSUPP;
	if (pp->p.order != 0)
		return -EOPNOTSUPP;
	if (pp->p.dma_dir != DMA_FROM_DEVICE)
		return -EOPNOTSUPP;

	ret = io_zcrx_map_area(ifq, ifq->area);
	if (ret)
		return ret;

	percpu_ref_get(&ifq->ctx->refs);
	return 0;
}

static void io_pp_zc_destroy(struct page_pool *pp)
{
	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
	struct io_zcrx_area *area = ifq->area;

	if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs))
		return;
	percpu_ref_put(&ifq->ctx->refs);
}

static int io_pp_nl_fill(void *mp_priv, struct sk_buff *rsp,
			 struct netdev_rx_queue *rxq)
{
	struct nlattr *nest;
	int type;

	type = rxq ? NETDEV_A_QUEUE_IO_URING : NETDEV_A_PAGE_POOL_IO_URING;
	nest = nla_nest_start(rsp, type);
	if (!nest)
		return -EMSGSIZE;
	nla_nest_end(rsp, nest);

	return 0;
}

static void io_pp_uninstall(void *mp_priv, struct netdev_rx_queue *rxq)
{
	struct pp_memory_provider_params *p = &rxq->mp_params;
	struct io_zcrx_ifq *ifq = mp_priv;

	io_zcrx_drop_netdev(ifq);
	if (ifq->area)
		io_zcrx_unmap_area(ifq, ifq->area);

	p->mp_ops = NULL;
	p->mp_priv = NULL;
}

static const struct memory_provider_ops io_uring_pp_zc_ops = {
	.alloc_netmems = io_pp_zc_alloc_netmems,
	.release_netmem = io_pp_zc_release_netmem,
	.init = io_pp_zc_init,
	.destroy = io_pp_zc_destroy,
	.nl_fill = io_pp_nl_fill,
	.uninstall = io_pp_uninstall,
};
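/*
 * Completion posting: each received chunk produces an IORING_CQE_F_MORE CQE
 * whose CQE32 extension (struct io_uring_zcrx_cqe) encodes the area id and
 * the byte offset of the data within the area.
 */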
static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov,
			      struct io_zcrx_ifq *ifq, int off, int len)
{
	struct io_uring_zcrx_cqe *rcqe;
	struct io_zcrx_area *area;
	struct io_uring_cqe *cqe;
	u64 offset;

	if (!io_defer_get_uncommited_cqe(req->ctx, &cqe))
		return false;

	cqe->user_data = req->cqe.user_data;
	cqe->res = len;
	cqe->flags = IORING_CQE_F_MORE;

	area = io_zcrx_iov_to_area(niov);
	offset = off + (net_iov_idx(niov) << PAGE_SHIFT);
	rcqe = (struct io_uring_zcrx_cqe *)(cqe + 1);
	rcqe->off = offset + ((u64)area->area_id << IORING_ZCRX_AREA_SHIFT);
	rcqe->__pad = 0;
	return true;
}

static struct net_iov *io_zcrx_alloc_fallback(struct io_zcrx_area *area)
{
	struct net_iov *niov = NULL;

	spin_lock_bh(&area->freelist_lock);
	if (area->free_count)
		niov = __io_zcrx_get_free_niov(area);
	spin_unlock_bh(&area->freelist_lock);

	if (niov)
		page_pool_fragment_netmem(net_iov_to_netmem(niov), 1);
	return niov;
}

static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
				  void *src_base, struct page *src_page,
				  unsigned int src_offset, size_t len)
{
	struct io_zcrx_area *area = ifq->area;
	size_t copied = 0;
	int ret = 0;

	while (len) {
		size_t copy_size = min_t(size_t, PAGE_SIZE, len);
		const int dst_off = 0;
		struct net_iov *niov;
		struct page *dst_page;
		void *dst_addr;

		niov = io_zcrx_alloc_fallback(area);
		if (!niov) {
			ret = -ENOMEM;
			break;
		}

		dst_page = io_zcrx_iov_page(niov);
		dst_addr = kmap_local_page(dst_page);
		if (src_page)
			src_base = kmap_local_page(src_page);

		memcpy(dst_addr, src_base + src_offset, copy_size);

		if (src_page)
			kunmap_local(src_base);
		kunmap_local(dst_addr);

		if (!io_zcrx_queue_cqe(req, niov, ifq, dst_off, copy_size)) {
			io_zcrx_return_niov(niov);
			ret = -ENOSPC;
			break;
		}

		io_zcrx_get_niov_uref(niov);
		src_offset += copy_size;
		len -= copy_size;
		copied += copy_size;
	}

	return copied ? copied : ret;
}

static int io_zcrx_copy_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
			     const skb_frag_t *frag, int off, int len)
{
	struct page *page = skb_frag_page(frag);
	u32 p_off, p_len, t, copied = 0;
	int ret = 0;

	off += skb_frag_off(frag);

	skb_frag_foreach_page(frag, off, len,
			      page, p_off, p_len, t) {
		ret = io_zcrx_copy_chunk(req, ifq, NULL, page, p_off, p_len);
		if (ret < 0)
			return copied ? copied : ret;
		copied += ret;
	}
	return copied;
}

static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
			     const skb_frag_t *frag, int off, int len)
{
	struct net_iov *niov;

	if (unlikely(!skb_frag_is_net_iov(frag)))
		return io_zcrx_copy_frag(req, ifq, frag, off, len);

	niov = netmem_to_net_iov(frag->netmem);
	if (!niov->pp || niov->pp->mp_ops != &io_uring_pp_zc_ops ||
	    io_pp_to_ifq(niov->pp) != ifq)
		return -EFAULT;

	if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len))
		return -ENOSPC;

	/*
	 * Prevent it from being recycled while user is accessing it.
	 * It has to be done before grabbing a user reference.
	 */
	page_pool_ref_netmem(net_iov_to_netmem(niov));
	io_zcrx_get_niov_uref(niov);
	return len;
}

static int
io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
		 unsigned int offset, size_t len)
{
	struct io_zcrx_args *args = desc->arg.data;
	struct io_zcrx_ifq *ifq = args->ifq;
	struct io_kiocb *req = args->req;
	struct sk_buff *frag_iter;
	unsigned start, start_off = offset;
	int i, copy, end, off;
	int ret = 0;

	len = min_t(size_t, len, desc->count);
	/*
	 * __tcp_read_sock() always calls io_zcrx_recv_skb one last time, even
	 * if desc->count is already 0. This is caused by the if (offset + 1 !=
	 * skb->len) check. Return early in this case to break out of
	 * __tcp_read_sock().
	 */
	if (!len)
		return 0;
	if (unlikely(args->nr_skbs++ > IO_SKBS_PER_CALL_LIMIT))
		return -EAGAIN;

	if (unlikely(offset < skb_headlen(skb))) {
		ssize_t copied;
		size_t to_copy;

		to_copy = min_t(size_t, skb_headlen(skb) - offset, len);
		copied = io_zcrx_copy_chunk(req, ifq, skb->data, NULL,
					    offset, to_copy);
		if (copied < 0) {
			ret = copied;
			goto out;
		}
		offset += copied;
		len -= copied;
		if (!len)
			goto out;
		if (offset != skb_headlen(skb))
			goto out;
	}

	start = skb_headlen(skb);

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		const skb_frag_t *frag;

		if (WARN_ON(start > offset + len))
			return -EFAULT;

		frag = &skb_shinfo(skb)->frags[i];
		end = start + skb_frag_size(frag);

		if (offset < end) {
			copy = end - offset;
			if (copy > len)
				copy = len;

			off = offset - start;
			ret = io_zcrx_recv_frag(req, ifq, frag, off, copy);
			if (ret < 0)
				goto out;

			offset += ret;
			len -= ret;
			if (len == 0 || ret != copy)
				goto out;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		if (WARN_ON(start > offset + len))
			return -EFAULT;

		end = start + frag_iter->len;
		if (offset < end) {
			copy = end - offset;
			if (copy > len)
				copy = len;

			off = offset - start;
			ret = io_zcrx_recv_skb(desc, frag_iter, off, copy);
			if (ret < 0)
				goto out;

			offset += ret;
			len -= ret;
			if (len == 0 || ret != copy)
				goto out;
		}
		start = end;
	}

out:
	if (offset == start_off)
		return ret;
	desc->count -= (offset - start_off);
	return offset - start_off;
}
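/*
 * io_zcrx_tcp_recvmsg() drives the socket via tcp_read_sock() with
 * io_zcrx_recv_skb() as the per-skb actor, and translates the result into
 * the usual recv error/retry semantics (including IOU_REQUEUE for multishot
 * requests that hit the per-call skb limit).
 */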
static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
			       struct sock *sk, int flags,
			       unsigned issue_flags, unsigned int *outlen)
{
	unsigned int len = *outlen;
	struct io_zcrx_args args = {
		.req = req,
		.ifq = ifq,
		.sock = sk->sk_socket,
	};
	read_descriptor_t rd_desc = {
		.count = len ? len : UINT_MAX,
		.arg.data = &args,
	};
	int ret;

	lock_sock(sk);
	ret = tcp_read_sock(sk, &rd_desc, io_zcrx_recv_skb);
	if (len && ret > 0)
		*outlen = len - ret;
	if (ret <= 0) {
		if (ret < 0 || sock_flag(sk, SOCK_DONE))
			goto out;
		if (sk->sk_err)
			ret = sock_error(sk);
		else if (sk->sk_shutdown & RCV_SHUTDOWN)
			goto out;
		else if (sk->sk_state == TCP_CLOSE)
			ret = -ENOTCONN;
		else
			ret = -EAGAIN;
	} else if (unlikely(args.nr_skbs > IO_SKBS_PER_CALL_LIMIT) &&
		   (issue_flags & IO_URING_F_MULTISHOT)) {
		ret = IOU_REQUEUE;
	} else if (sock_flag(sk, SOCK_DONE)) {
		/* Make it retry until it finally gets 0. */
		if (issue_flags & IO_URING_F_MULTISHOT)
			ret = IOU_REQUEUE;
		else
			ret = -EAGAIN;
	}
out:
	release_sock(sk);
	return ret;
}

int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
		 struct socket *sock, unsigned int flags,
		 unsigned issue_flags, unsigned int *len)
{
	struct sock *sk = sock->sk;
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot->recvmsg != tcp_recvmsg)
		return -EPROTONOSUPPORT;

	sock_rps_record_flow(sk);
	return io_zcrx_tcp_recvmsg(req, ifq, sk, flags, issue_flags, len);
}