// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/dma-map-ops.h>
#include <linux/mm.h>
#include <linux/nospec.h>
#include <linux/io_uring.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff_ref.h>

#include <net/page_pool/helpers.h>
#include <net/page_pool/memory_provider.h>
#include <net/netlink.h>
#include <net/netdev_rx_queue.h>
#include <net/tcp.h>
#include <net/rps.h>

#include <trace/events/page_pool.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "kbuf.h"
#include "memmap.h"
#include "zcrx.h"
#include "rsrc.h"

#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)

static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp)
{
	return pp->mp_priv;
}

static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
{
	struct net_iov_area *owner = net_iov_owner(niov);

	return container_of(owner, struct io_zcrx_area, nia);
}

static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
{
	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);

	return area->mem.pages[net_iov_idx(niov)];
}

static void io_release_dmabuf(struct io_zcrx_mem *mem)
{
	if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
		return;

	if (mem->sgt)
		dma_buf_unmap_attachment_unlocked(mem->attach, mem->sgt,
						  DMA_FROM_DEVICE);
	if (mem->attach)
		dma_buf_detach(mem->dmabuf, mem->attach);
	if (mem->dmabuf)
		dma_buf_put(mem->dmabuf);

	mem->sgt = NULL;
	mem->attach = NULL;
	mem->dmabuf = NULL;
}

static int io_import_dmabuf(struct io_zcrx_ifq *ifq,
			    struct io_zcrx_mem *mem,
			    struct io_uring_zcrx_area_reg *area_reg)
{
	unsigned long off = (unsigned long)area_reg->addr;
	unsigned long len = (unsigned long)area_reg->len;
	unsigned long total_size = 0;
	struct scatterlist *sg;
	int dmabuf_fd = area_reg->dmabuf_fd;
	int i, ret;

	if (off)
		return -EINVAL;
	if (WARN_ON_ONCE(!ifq->dev))
		return -EFAULT;
	if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
		return -EINVAL;

	mem->is_dmabuf = true;
	mem->dmabuf = dma_buf_get(dmabuf_fd);
	if (IS_ERR(mem->dmabuf)) {
		ret = PTR_ERR(mem->dmabuf);
		mem->dmabuf = NULL;
		goto err;
	}

	mem->attach = dma_buf_attach(mem->dmabuf, ifq->dev);
	if (IS_ERR(mem->attach)) {
		ret = PTR_ERR(mem->attach);
		mem->attach = NULL;
		goto err;
	}

	mem->sgt = dma_buf_map_attachment_unlocked(mem->attach, DMA_FROM_DEVICE);
	if (IS_ERR(mem->sgt)) {
		ret = PTR_ERR(mem->sgt);
		mem->sgt = NULL;
		goto err;
	}

	for_each_sgtable_dma_sg(mem->sgt, sg, i)
		total_size += sg_dma_len(sg);

	if (total_size != len) {
		ret = -EINVAL;
		goto err;
	}

	mem->dmabuf_offset = off;
	mem->size = len;
	return 0;
err:
	io_release_dmabuf(mem);
	return ret;
}

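/*
 * Walk the dma-buf scatterlist and hand out PAGE_SIZE chunks of its DMA
 * addresses to the area's net_iovs in order, after skipping the registered
 * dmabuf offset. Returns the number of niovs that were assigned an address,
 * or 0 if setting an address fails part way through.
 */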
static int io_zcrx_map_area_dmabuf(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
	unsigned long off = area->mem.dmabuf_offset;
	struct scatterlist *sg;
	unsigned i, niov_idx = 0;

	if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
		return -EINVAL;

	for_each_sgtable_dma_sg(area->mem.sgt, sg, i) {
		dma_addr_t dma = sg_dma_address(sg);
		unsigned long sg_len = sg_dma_len(sg);
		unsigned long sg_off = min(sg_len, off);

		off -= sg_off;
		sg_len -= sg_off;
		dma += sg_off;

		while (sg_len && niov_idx < area->nia.num_niovs) {
			struct net_iov *niov = &area->nia.niovs[niov_idx];

			if (net_mp_niov_set_dma_addr(niov, dma))
				return 0;
			sg_len -= PAGE_SIZE;
			dma += PAGE_SIZE;
			niov_idx++;
		}
	}
	return niov_idx;
}

static int io_import_umem(struct io_zcrx_ifq *ifq,
			  struct io_zcrx_mem *mem,
			  struct io_uring_zcrx_area_reg *area_reg)
{
	struct page **pages;
	int nr_pages;

	if (area_reg->dmabuf_fd)
		return -EINVAL;
	if (!area_reg->addr)
		return -EFAULT;
	pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
			     &nr_pages);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	mem->pages = pages;
	mem->nr_folios = nr_pages;
	mem->size = area_reg->len;
	return 0;
}

static void io_release_area_mem(struct io_zcrx_mem *mem)
{
	if (mem->is_dmabuf) {
		io_release_dmabuf(mem);
		return;
	}
	if (mem->pages) {
		unpin_user_pages(mem->pages, mem->nr_folios);
		kvfree(mem->pages);
	}
}

static int io_import_area(struct io_zcrx_ifq *ifq,
			  struct io_zcrx_mem *mem,
			  struct io_uring_zcrx_area_reg *area_reg)
{
	int ret;

	ret = io_validate_user_buf_range(area_reg->addr, area_reg->len);
	if (ret)
		return ret;
	if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
		return -EINVAL;

	if (area_reg->flags & IORING_ZCRX_AREA_DMABUF)
		return io_import_dmabuf(ifq, mem, area_reg);
	return io_import_umem(ifq, mem, area_reg);
}

static void io_zcrx_unmap_umem(struct io_zcrx_ifq *ifq,
			       struct io_zcrx_area *area, int nr_mapped)
{
	int i;

	for (i = 0; i < nr_mapped; i++) {
		netmem_ref netmem = net_iov_to_netmem(&area->nia.niovs[i]);
		dma_addr_t dma = page_pool_get_dma_addr_netmem(netmem);

		dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE,
				     DMA_FROM_DEVICE, IO_DMA_ATTR);
	}
}

static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
				 struct io_zcrx_area *area, int nr_mapped)
{
	int i;

	if (area->mem.is_dmabuf)
		io_release_dmabuf(&area->mem);
	else
		io_zcrx_unmap_umem(ifq, area, nr_mapped);

	for (i = 0; i < area->nia.num_niovs; i++)
		net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0);
}

static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
	guard(mutex)(&ifq->dma_lock);

	if (area->is_mapped)
		__io_zcrx_unmap_area(ifq, area, area->nia.num_niovs);
	area->is_mapped = false;
}

static int io_zcrx_map_area_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
	int i;

	for (i = 0; i < area->nia.num_niovs; i++) {
		struct net_iov *niov = &area->nia.niovs[i];
		dma_addr_t dma;

		dma = dma_map_page_attrs(ifq->dev, area->mem.pages[i], 0,
					 PAGE_SIZE, DMA_FROM_DEVICE, IO_DMA_ATTR);
		if (dma_mapping_error(ifq->dev, dma))
			break;
		if (net_mp_niov_set_dma_addr(niov, dma)) {
			dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE,
					     DMA_FROM_DEVICE, IO_DMA_ATTR);
			break;
		}
	}
	return i;
}

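/*
 * DMA-map the whole area for the device. Mapping is done lazily from the page
 * pool init path and serialised by ifq->dma_lock; a partial mapping is rolled
 * back and reported as -EINVAL.
 */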
static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
	unsigned nr;

	guard(mutex)(&ifq->dma_lock);
	if (area->is_mapped)
		return 0;

	if (area->mem.is_dmabuf)
		nr = io_zcrx_map_area_dmabuf(ifq, area);
	else
		nr = io_zcrx_map_area_umem(ifq, area);

	if (nr != area->nia.num_niovs) {
		__io_zcrx_unmap_area(ifq, area, nr);
		return -EINVAL;
	}

	area->is_mapped = true;
	return 0;
}

static void io_zcrx_sync_for_device(const struct page_pool *pool,
				    struct net_iov *niov)
{
#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
	dma_addr_t dma_addr;

	if (!dma_dev_need_sync(pool->p.dev))
		return;

	dma_addr = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov));
	__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
				     PAGE_SIZE, pool->p.dma_dir);
#endif
}

#define IO_RQ_MAX_ENTRIES		32768

#define IO_SKBS_PER_CALL_LIMIT	20

struct io_zcrx_args {
	struct io_kiocb		*req;
	struct io_zcrx_ifq	*ifq;
	struct socket		*sock;
	unsigned		nr_skbs;
};

static const struct memory_provider_ops io_uring_pp_zc_ops;

static inline atomic_t *io_get_user_counter(struct net_iov *niov)
{
	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);

	return &area->user_refs[net_iov_idx(niov)];
}

static bool io_zcrx_put_niov_uref(struct net_iov *niov)
{
	atomic_t *uref = io_get_user_counter(niov);

	if (unlikely(!atomic_read(uref)))
		return false;
	atomic_dec(uref);
	return true;
}

static void io_zcrx_get_niov_uref(struct net_iov *niov)
{
	atomic_inc(io_get_user_counter(niov));
}

static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
				 struct io_uring_zcrx_ifq_reg *reg,
				 struct io_uring_region_desc *rd,
				 u32 id)
{
	u64 mmap_offset;
	size_t off, size;
	void *ptr;
	int ret;

	off = sizeof(struct io_uring);
	size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries;
	if (size > rd->size)
		return -EINVAL;

	mmap_offset = IORING_MAP_OFF_ZCRX_REGION;
	mmap_offset += id << IORING_OFF_PBUF_SHIFT;

	ret = io_create_region(ifq->ctx, &ifq->region, rd, mmap_offset);
	if (ret < 0)
		return ret;

	ptr = io_region_get_ptr(&ifq->region);
	ifq->rq_ring = (struct io_uring *)ptr;
	ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
	return 0;
}

static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
{
	io_free_region(ifq->ctx, &ifq->region);
	ifq->rq_ring = NULL;
	ifq->rqes = NULL;
}

static void io_zcrx_free_area(struct io_zcrx_area *area)
{
	if (area->ifq)
		io_zcrx_unmap_area(area->ifq, area);
	io_release_area_mem(&area->mem);

	kvfree(area->freelist);
	kvfree(area->nia.niovs);
	kvfree(area->user_refs);
	kfree(area);
}

#define IO_ZCRX_AREA_SUPPORTED_FLAGS	(IORING_ZCRX_AREA_DMABUF)

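/*
 * Create the receive buffer area backing this ifq: import the user memory or
 * dma-buf, then allocate the net_iov array, the buffer freelist and the
 * per-buffer userspace reference counters. Only a single area per ifq is
 * supported for now, so it always gets area_id 0.
 */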
static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
			       struct io_zcrx_area **res,
			       struct io_uring_zcrx_area_reg *area_reg)
{
	struct io_zcrx_area *area;
	unsigned nr_iovs;
	int i, ret;

	if (area_reg->flags & ~IO_ZCRX_AREA_SUPPORTED_FLAGS)
		return -EINVAL;
	if (area_reg->rq_area_token)
		return -EINVAL;
	if (area_reg->__resv2[0] || area_reg->__resv2[1])
		return -EINVAL;

	ret = -ENOMEM;
	area = kzalloc(sizeof(*area), GFP_KERNEL);
	if (!area)
		goto err;

	ret = io_import_area(ifq, &area->mem, area_reg);
	if (ret)
		goto err;

	nr_iovs = area->mem.size >> PAGE_SHIFT;
	area->nia.num_niovs = nr_iovs;

	ret = -ENOMEM;
	area->nia.niovs = kvmalloc_array(nr_iovs, sizeof(area->nia.niovs[0]),
					 GFP_KERNEL | __GFP_ZERO);
	if (!area->nia.niovs)
		goto err;

	area->freelist = kvmalloc_array(nr_iovs, sizeof(area->freelist[0]),
					GFP_KERNEL | __GFP_ZERO);
	if (!area->freelist)
		goto err;

	area->user_refs = kvmalloc_array(nr_iovs, sizeof(area->user_refs[0]),
					 GFP_KERNEL | __GFP_ZERO);
	if (!area->user_refs)
		goto err;

	for (i = 0; i < nr_iovs; i++) {
		struct net_iov *niov = &area->nia.niovs[i];

		niov->owner = &area->nia;
		area->freelist[i] = i;
		atomic_set(&area->user_refs[i], 0);
		niov->type = NET_IOV_IOURING;
	}

	area->free_count = nr_iovs;
	area->ifq = ifq;
	/* we're only supporting one area per ifq for now */
	area->area_id = 0;
	area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT;
	spin_lock_init(&area->freelist_lock);
	*res = area;
	return 0;
err:
	if (area)
		io_zcrx_free_area(area);
	return ret;
}

static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
{
	struct io_zcrx_ifq *ifq;

	ifq = kzalloc(sizeof(*ifq), GFP_KERNEL);
	if (!ifq)
		return NULL;

	ifq->if_rxq = -1;
	ifq->ctx = ctx;
	spin_lock_init(&ifq->lock);
	spin_lock_init(&ifq->rq_lock);
	mutex_init(&ifq->dma_lock);
	return ifq;
}

static void io_zcrx_drop_netdev(struct io_zcrx_ifq *ifq)
{
	spin_lock(&ifq->lock);
	if (ifq->netdev) {
		netdev_put(ifq->netdev, &ifq->netdev_tracker);
		ifq->netdev = NULL;
	}
	spin_unlock(&ifq->lock);
}

static void io_close_queue(struct io_zcrx_ifq *ifq)
{
	struct net_device *netdev;
	netdevice_tracker netdev_tracker;
	struct pp_memory_provider_params p = {
		.mp_ops = &io_uring_pp_zc_ops,
		.mp_priv = ifq,
	};

	if (ifq->if_rxq == -1)
		return;

	spin_lock(&ifq->lock);
	netdev = ifq->netdev;
	netdev_tracker = ifq->netdev_tracker;
	ifq->netdev = NULL;
	spin_unlock(&ifq->lock);

	if (netdev) {
		net_mp_close_rxq(netdev, ifq->if_rxq, &p);
		netdev_put(netdev, &netdev_tracker);
	}
	ifq->if_rxq = -1;
}

static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
{
	io_close_queue(ifq);
	io_zcrx_drop_netdev(ifq);

	if (ifq->area)
		io_zcrx_free_area(ifq->area);
	if (ifq->dev)
		put_device(ifq->dev);

	io_free_rbuf_ring(ifq);
	mutex_destroy(&ifq->dma_lock);
	kfree(ifq);
}

struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
					    unsigned int id)
{
	struct io_zcrx_ifq *ifq = xa_load(&ctx->zcrx_ctxs, id);

	lockdep_assert_held(&ctx->mmap_lock);

	return ifq ? &ifq->region : NULL;
}

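/*
 * Register a zero copy rx interface queue: validate the registration
 * descriptors, allocate the ifq and its refill ring region, resolve the
 * netdev and its parent device, create the buffer area, and bind the io_uring
 * memory provider to the requested hardware rx queue. On success the refill
 * ring offsets, the zcrx id and the area's rq_area_token are copied back to
 * userspace.
 */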
int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
			 struct io_uring_zcrx_ifq_reg __user *arg)
{
	struct pp_memory_provider_params mp_param = {};
	struct io_uring_zcrx_area_reg area;
	struct io_uring_zcrx_ifq_reg reg;
	struct io_uring_region_desc rd;
	struct io_zcrx_ifq *ifq;
	int ret;
	u32 id;

	/*
	 * 1. Interface queue allocation.
	 * 2. It can observe data destined for sockets of other tasks.
	 */
	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	/* mandatory io_uring features for zc rx */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN &&
	      ctx->flags & IORING_SETUP_CQE32))
		return -EINVAL;
	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)) ||
	    reg.__resv2 || reg.zcrx_id)
		return -EINVAL;
	if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
		return -EINVAL;
	if (reg.rq_entries > IO_RQ_MAX_ENTRIES) {
		if (!(ctx->flags & IORING_SETUP_CLAMP))
			return -EINVAL;
		reg.rq_entries = IO_RQ_MAX_ENTRIES;
	}
	reg.rq_entries = roundup_pow_of_two(reg.rq_entries);

	if (copy_from_user(&area, u64_to_user_ptr(reg.area_ptr), sizeof(area)))
		return -EFAULT;

	ifq = io_zcrx_ifq_alloc(ctx);
	if (!ifq)
		return -ENOMEM;
	ifq->rq_entries = reg.rq_entries;

	scoped_guard(mutex, &ctx->mmap_lock) {
		/* preallocate id */
		ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL);
		if (ret)
			goto ifq_free;
	}

	ret = io_allocate_rbuf_ring(ifq, &reg, &rd, id);
	if (ret)
		goto err;

	ifq->netdev = netdev_get_by_index(current->nsproxy->net_ns, reg.if_idx,
					  &ifq->netdev_tracker, GFP_KERNEL);
	if (!ifq->netdev) {
		ret = -ENODEV;
		goto err;
	}

	ifq->dev = ifq->netdev->dev.parent;
	if (!ifq->dev) {
		ret = -EOPNOTSUPP;
		goto err;
	}
	get_device(ifq->dev);

	ret = io_zcrx_create_area(ifq, &ifq->area, &area);
	if (ret)
		goto err;

	mp_param.mp_ops = &io_uring_pp_zc_ops;
	mp_param.mp_priv = ifq;
	ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param);
	if (ret)
		goto err;
	ifq->if_rxq = reg.if_rxq;

	reg.offsets.rqes = sizeof(struct io_uring);
	reg.offsets.head = offsetof(struct io_uring, head);
	reg.offsets.tail = offsetof(struct io_uring, tail);
	reg.zcrx_id = id;

	scoped_guard(mutex, &ctx->mmap_lock) {
		/* publish ifq */
		ret = -ENOMEM;
		if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL))
			goto err;
	}

	if (copy_to_user(arg, &reg, sizeof(reg)) ||
	    copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) ||
	    copy_to_user(u64_to_user_ptr(reg.area_ptr), &area, sizeof(area))) {
		ret = -EFAULT;
		goto err;
	}
	return 0;
err:
	scoped_guard(mutex, &ctx->mmap_lock)
		xa_erase(&ctx->zcrx_ctxs, id);
ifq_free:
	io_zcrx_ifq_free(ifq);
	return ret;
}

void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
{
	struct io_zcrx_ifq *ifq;

	lockdep_assert_held(&ctx->uring_lock);

	while (1) {
		scoped_guard(mutex, &ctx->mmap_lock) {
			unsigned long id = 0;

			ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT);
			if (ifq)
				xa_erase(&ctx->zcrx_ctxs, id);
		}
		if (!ifq)
			break;
		io_zcrx_ifq_free(ifq);
	}

	xa_destroy(&ctx->zcrx_ctxs);
}

static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
{
	unsigned niov_idx;

	lockdep_assert_held(&area->freelist_lock);

	niov_idx = area->freelist[--area->free_count];
	return &area->nia.niovs[niov_idx];
}

static void io_zcrx_return_niov_freelist(struct net_iov *niov)
{
	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);

	spin_lock_bh(&area->freelist_lock);
	area->freelist[area->free_count++] = net_iov_idx(niov);
	spin_unlock_bh(&area->freelist_lock);
}

static void io_zcrx_return_niov(struct net_iov *niov)
{
	netmem_ref netmem = net_iov_to_netmem(niov);

	if (!niov->pp) {
		/* copy fallback allocated niovs */
		io_zcrx_return_niov_freelist(niov);
		return;
	}
	page_pool_put_unrefed_netmem(niov->pp, netmem, -1, false);
}

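/*
 * Reclaim all buffers currently handed out to userspace: drop their user
 * references and return them to the page pool, or to the area freelist for
 * copy fallback buffers, so the ifq can be torn down even if userspace never
 * recycles them through the refill ring.
 */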
static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
{
	struct io_zcrx_area *area = ifq->area;
	int i;

	if (!area)
		return;

	/* Reclaim back all buffers given to the user space. */
	for (i = 0; i < area->nia.num_niovs; i++) {
		struct net_iov *niov = &area->nia.niovs[i];
		int nr;

		if (!atomic_read(io_get_user_counter(niov)))
			continue;
		nr = atomic_xchg(io_get_user_counter(niov), 0);
		if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
			io_zcrx_return_niov(niov);
	}
}

void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
{
	struct io_zcrx_ifq *ifq;
	unsigned long index;

	lockdep_assert_held(&ctx->uring_lock);

	xa_for_each(&ctx->zcrx_ctxs, index, ifq) {
		io_zcrx_scrub(ifq);
		io_close_queue(ifq);
	}
}

static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq)
{
	u32 entries;

	entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head;
	return min(entries, ifq->rq_entries);
}

static struct io_uring_zcrx_rqe *io_zcrx_get_rqe(struct io_zcrx_ifq *ifq,
						 unsigned mask)
{
	unsigned int idx = ifq->cached_rq_head++ & mask;

	return &ifq->rqes[idx];
}

static void io_zcrx_ring_refill(struct page_pool *pp,
				struct io_zcrx_ifq *ifq)
{
	unsigned int mask = ifq->rq_entries - 1;
	unsigned int entries;
	netmem_ref netmem;

	spin_lock_bh(&ifq->rq_lock);

	entries = io_zcrx_rqring_entries(ifq);
	entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL - pp->alloc.count);
	if (unlikely(!entries)) {
		spin_unlock_bh(&ifq->rq_lock);
		return;
	}

	do {
		struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask);
		struct io_zcrx_area *area;
		struct net_iov *niov;
		unsigned niov_idx, area_idx;

		area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT;
		niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) >> PAGE_SHIFT;

		if (unlikely(rqe->__pad || area_idx))
			continue;
		area = ifq->area;

		if (unlikely(niov_idx >= area->nia.num_niovs))
			continue;
		niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs);

		niov = &area->nia.niovs[niov_idx];
		if (!io_zcrx_put_niov_uref(niov))
			continue;

		netmem = net_iov_to_netmem(niov);
		if (page_pool_unref_netmem(netmem, 1) != 0)
			continue;

		if (unlikely(niov->pp != pp)) {
			io_zcrx_return_niov(niov);
			continue;
		}

		io_zcrx_sync_for_device(pp, niov);
		net_mp_netmem_place_in_cache(pp, netmem);
	} while (--entries);

	smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head);
	spin_unlock_bh(&ifq->rq_lock);
}

static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq)
{
	struct io_zcrx_area *area = ifq->area;

	spin_lock_bh(&area->freelist_lock);
	while (area->free_count && pp->alloc.count < PP_ALLOC_CACHE_REFILL) {
		struct net_iov *niov = __io_zcrx_get_free_niov(area);
		netmem_ref netmem = net_iov_to_netmem(niov);

		net_mp_niov_set_page_pool(pp, niov);
		io_zcrx_sync_for_device(pp, niov);
		net_mp_netmem_place_in_cache(pp, netmem);
	}
	spin_unlock_bh(&area->freelist_lock);
}

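/*
 * Memory provider allocation hook: refill the page pool alloc cache from the
 * refill ring entries posted by userspace, falling back to the area freelist
 * when the ring is empty.
 */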
static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
{
	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);

	/* pp should already be ensuring that */
	if (unlikely(pp->alloc.count))
		goto out_return;

	io_zcrx_ring_refill(pp, ifq);
	if (likely(pp->alloc.count))
		goto out_return;

	io_zcrx_refill_slow(pp, ifq);
	if (!pp->alloc.count)
		return 0;
out_return:
	return pp->alloc.cache[--pp->alloc.count];
}

static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)
{
	struct net_iov *niov;

	if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
		return false;

	niov = netmem_to_net_iov(netmem);
	net_mp_niov_clear_page_pool(niov);
	io_zcrx_return_niov_freelist(niov);
	return false;
}

static int io_pp_zc_init(struct page_pool *pp)
{
	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
	int ret;

	if (WARN_ON_ONCE(!ifq))
		return -EINVAL;
	if (WARN_ON_ONCE(ifq->dev != pp->p.dev))
		return -EINVAL;
	if (WARN_ON_ONCE(!pp->dma_map))
		return -EOPNOTSUPP;
	if (pp->p.order != 0)
		return -EOPNOTSUPP;
	if (pp->p.dma_dir != DMA_FROM_DEVICE)
		return -EOPNOTSUPP;

	ret = io_zcrx_map_area(ifq, ifq->area);
	if (ret)
		return ret;

	percpu_ref_get(&ifq->ctx->refs);
	return 0;
}

static void io_pp_zc_destroy(struct page_pool *pp)
{
	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);

	percpu_ref_put(&ifq->ctx->refs);
}

static int io_pp_nl_fill(void *mp_priv, struct sk_buff *rsp,
			 struct netdev_rx_queue *rxq)
{
	struct nlattr *nest;
	int type;

	type = rxq ? NETDEV_A_QUEUE_IO_URING : NETDEV_A_PAGE_POOL_IO_URING;
	nest = nla_nest_start(rsp, type);
	if (!nest)
		return -EMSGSIZE;
	nla_nest_end(rsp, nest);

	return 0;
}

static void io_pp_uninstall(void *mp_priv, struct netdev_rx_queue *rxq)
{
	struct pp_memory_provider_params *p = &rxq->mp_params;
	struct io_zcrx_ifq *ifq = mp_priv;

	io_zcrx_drop_netdev(ifq);
	if (ifq->area)
		io_zcrx_unmap_area(ifq, ifq->area);

	p->mp_ops = NULL;
	p->mp_priv = NULL;
}

static const struct memory_provider_ops io_uring_pp_zc_ops = {
	.alloc_netmems		= io_pp_zc_alloc_netmems,
	.release_netmem		= io_pp_zc_release_netmem,
	.init			= io_pp_zc_init,
	.destroy		= io_pp_zc_destroy,
	.nl_fill		= io_pp_nl_fill,
	.uninstall		= io_pp_uninstall,
};

static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov,
			      struct io_zcrx_ifq *ifq, int off, int len)
{
	struct io_uring_zcrx_cqe *rcqe;
	struct io_zcrx_area *area;
	struct io_uring_cqe *cqe;
	u64 offset;

	if (!io_defer_get_uncommited_cqe(req->ctx, &cqe))
		return false;

	cqe->user_data = req->cqe.user_data;
	cqe->res = len;
	cqe->flags = IORING_CQE_F_MORE;

	area = io_zcrx_iov_to_area(niov);
	offset = off + (net_iov_idx(niov) << PAGE_SHIFT);
	rcqe = (struct io_uring_zcrx_cqe *)(cqe + 1);
	rcqe->off = offset + ((u64)area->area_id << IORING_ZCRX_AREA_SHIFT);
	rcqe->__pad = 0;
	return true;
}

static struct net_iov *io_zcrx_alloc_fallback(struct io_zcrx_area *area)
{
	struct net_iov *niov = NULL;

	spin_lock_bh(&area->freelist_lock);
	if (area->free_count)
		niov = __io_zcrx_get_free_niov(area);
	spin_unlock_bh(&area->freelist_lock);

	if (niov)
		page_pool_fragment_netmem(net_iov_to_netmem(niov), 1);
	return niov;
}

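/*
 * Copy fallback path: data that is not backed by this ifq's net_iovs (linear
 * skb data or frags in regular pages) is copied into freelist buffers and
 * completed with the same zcrx CQE format. Not supported for dma-buf backed
 * areas, for which -EFAULT is returned.
 */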
static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
				  void *src_base, struct page *src_page,
				  unsigned int src_offset, size_t len)
{
	struct io_zcrx_area *area = ifq->area;
	size_t copied = 0;
	int ret = 0;

	if (area->mem.is_dmabuf)
		return -EFAULT;

	while (len) {
		size_t copy_size = min_t(size_t, PAGE_SIZE, len);
		const int dst_off = 0;
		struct net_iov *niov;
		struct page *dst_page;
		void *dst_addr;

		niov = io_zcrx_alloc_fallback(area);
		if (!niov) {
			ret = -ENOMEM;
			break;
		}

		dst_page = io_zcrx_iov_page(niov);
		dst_addr = kmap_local_page(dst_page);
		if (src_page)
			src_base = kmap_local_page(src_page);

		memcpy(dst_addr, src_base + src_offset, copy_size);

		if (src_page)
			kunmap_local(src_base);
		kunmap_local(dst_addr);

		if (!io_zcrx_queue_cqe(req, niov, ifq, dst_off, copy_size)) {
			io_zcrx_return_niov(niov);
			ret = -ENOSPC;
			break;
		}

		io_zcrx_get_niov_uref(niov);
		src_offset += copy_size;
		len -= copy_size;
		copied += copy_size;
	}

	return copied ? copied : ret;
}

static int io_zcrx_copy_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
			     const skb_frag_t *frag, int off, int len)
{
	struct page *page = skb_frag_page(frag);
	u32 p_off, p_len, t, copied = 0;
	int ret = 0;

	off += skb_frag_off(frag);

	skb_frag_foreach_page(frag, off, len,
			      page, p_off, p_len, t) {
		ret = io_zcrx_copy_chunk(req, ifq, NULL, page, p_off, p_len);
		if (ret < 0)
			return copied ? copied : ret;
		copied += ret;
	}
	return copied;
}

static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
			     const skb_frag_t *frag, int off, int len)
{
	struct net_iov *niov;

	if (unlikely(!skb_frag_is_net_iov(frag)))
		return io_zcrx_copy_frag(req, ifq, frag, off, len);

	niov = netmem_to_net_iov(frag->netmem);
	if (!niov->pp || niov->pp->mp_ops != &io_uring_pp_zc_ops ||
	    io_pp_to_ifq(niov->pp) != ifq)
		return -EFAULT;

	if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len))
		return -ENOSPC;

	/*
	 * Prevent it from being recycled while user is accessing it.
	 * It has to be done before grabbing a user reference.
	 */
	page_pool_ref_netmem(net_iov_to_netmem(niov));
	io_zcrx_get_niov_uref(niov);
	return len;
}

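/*
 * tcp_read_sock() callback: post a zcrx CQE for each frag that already lives
 * in this ifq's buffers and copy everything else, capping the number of skbs
 * handled per call at IO_SKBS_PER_CALL_LIMIT.
 */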
static int
io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
		 unsigned int offset, size_t len)
{
	struct io_zcrx_args *args = desc->arg.data;
	struct io_zcrx_ifq *ifq = args->ifq;
	struct io_kiocb *req = args->req;
	struct sk_buff *frag_iter;
	unsigned start, start_off = offset;
	int i, copy, end, off;
	int ret = 0;

	len = min_t(size_t, len, desc->count);
	/*
	 * __tcp_read_sock() always calls io_zcrx_recv_skb one last time, even
	 * if desc->count is already 0. This is caused by the if (offset + 1 !=
	 * skb->len) check. Return early in this case to break out of
	 * __tcp_read_sock().
	 */
	if (!len)
		return 0;
	if (unlikely(args->nr_skbs++ > IO_SKBS_PER_CALL_LIMIT))
		return -EAGAIN;

	if (unlikely(offset < skb_headlen(skb))) {
		ssize_t copied;
		size_t to_copy;

		to_copy = min_t(size_t, skb_headlen(skb) - offset, len);
		copied = io_zcrx_copy_chunk(req, ifq, skb->data, NULL,
					    offset, to_copy);
		if (copied < 0) {
			ret = copied;
			goto out;
		}
		offset += copied;
		len -= copied;
		if (!len)
			goto out;
		if (offset != skb_headlen(skb))
			goto out;
	}

	start = skb_headlen(skb);

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		const skb_frag_t *frag;

		if (WARN_ON(start > offset + len))
			return -EFAULT;

		frag = &skb_shinfo(skb)->frags[i];
		end = start + skb_frag_size(frag);

		if (offset < end) {
			copy = end - offset;
			if (copy > len)
				copy = len;

			off = offset - start;
			ret = io_zcrx_recv_frag(req, ifq, frag, off, copy);
			if (ret < 0)
				goto out;

			offset += ret;
			len -= ret;
			if (len == 0 || ret != copy)
				goto out;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		if (WARN_ON(start > offset + len))
			return -EFAULT;

		end = start + frag_iter->len;
		if (offset < end) {
			copy = end - offset;
			if (copy > len)
				copy = len;

			off = offset - start;
			ret = io_zcrx_recv_skb(desc, frag_iter, off, copy);
			if (ret < 0)
				goto out;

			offset += ret;
			len -= ret;
			if (len == 0 || ret != copy)
				goto out;
		}
		start = end;
	}

out:
	if (offset == start_off)
		return ret;
	desc->count -= (offset - start_off);
	return offset - start_off;
}

static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
			       struct sock *sk, int flags,
			       unsigned issue_flags, unsigned int *outlen)
{
	unsigned int len = *outlen;
	struct io_zcrx_args args = {
		.req = req,
		.ifq = ifq,
		.sock = sk->sk_socket,
	};
	read_descriptor_t rd_desc = {
		.count = len ? len : UINT_MAX,
		.arg.data = &args,
	};
	int ret;

	lock_sock(sk);
	ret = tcp_read_sock(sk, &rd_desc, io_zcrx_recv_skb);
	if (len && ret > 0)
		*outlen = len - ret;
	if (ret <= 0) {
		if (ret < 0 || sock_flag(sk, SOCK_DONE))
			goto out;
		if (sk->sk_err)
			ret = sock_error(sk);
		else if (sk->sk_shutdown & RCV_SHUTDOWN)
			goto out;
		else if (sk->sk_state == TCP_CLOSE)
			ret = -ENOTCONN;
		else
			ret = -EAGAIN;
	} else if (unlikely(args.nr_skbs > IO_SKBS_PER_CALL_LIMIT) &&
		   (issue_flags & IO_URING_F_MULTISHOT)) {
		ret = IOU_REQUEUE;
	} else if (sock_flag(sk, SOCK_DONE)) {
		/* Make it to retry until it finally gets 0. */
		if (issue_flags & IO_URING_F_MULTISHOT)
			ret = IOU_REQUEUE;
		else
			ret = -EAGAIN;
	}
out:
	release_sock(sk);
	return ret;
}

int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
		 struct socket *sock, unsigned int flags,
		 unsigned issue_flags, unsigned int *len)
{
	struct sock *sk = sock->sk;
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot->recvmsg != tcp_recvmsg)
		return -EPROTONOSUPPORT;

	sock_rps_record_flow(sk);
	return io_zcrx_tcp_recvmsg(req, ifq, sk, flags, issue_flags, len);
}