1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/kernel.h> 3 #include <linux/errno.h> 4 #include <linux/dma-map-ops.h> 5 #include <linux/mm.h> 6 #include <linux/nospec.h> 7 #include <linux/io_uring.h> 8 #include <linux/netdevice.h> 9 #include <linux/rtnetlink.h> 10 #include <linux/skbuff_ref.h> 11 #include <linux/anon_inodes.h> 12 13 #include <net/page_pool/helpers.h> 14 #include <net/page_pool/memory_provider.h> 15 #include <net/netlink.h> 16 #include <net/netdev_queues.h> 17 #include <net/netdev_rx_queue.h> 18 #include <net/tcp.h> 19 #include <net/rps.h> 20 21 #include <trace/events/page_pool.h> 22 23 #include <uapi/linux/io_uring.h> 24 25 #include "io_uring.h" 26 #include "kbuf.h" 27 #include "memmap.h" 28 #include "zcrx.h" 29 #include "rsrc.h" 30 31 #define IO_ZCRX_AREA_SUPPORTED_FLAGS (IORING_ZCRX_AREA_DMABUF) 32 33 #define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) 34 35 static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp) 36 { 37 return pp->mp_priv; 38 } 39 40 static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov) 41 { 42 struct net_iov_area *owner = net_iov_owner(niov); 43 44 return container_of(owner, struct io_zcrx_area, nia); 45 } 46 47 static bool zcrx_set_ring_ctx(struct io_zcrx_ifq *zcrx, 48 struct io_ring_ctx *ctx) 49 { 50 guard(spinlock_bh)(&zcrx->ctx_lock); 51 if (zcrx->master_ctx) 52 return false; 53 percpu_ref_get(&ctx->refs); 54 zcrx->master_ctx = ctx; 55 return true; 56 } 57 58 static inline struct page *io_zcrx_iov_page(const struct net_iov *niov) 59 { 60 struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); 61 unsigned niov_pages_shift; 62 63 lockdep_assert(!area->mem.is_dmabuf); 64 65 niov_pages_shift = area->ifq->niov_shift - PAGE_SHIFT; 66 return area->mem.pages[net_iov_idx(niov) << niov_pages_shift]; 67 } 68 69 static int io_area_max_shift(struct io_zcrx_mem *mem) 70 { 71 struct sg_table *sgt = mem->sgt; 72 struct scatterlist *sg; 73 unsigned shift = -1U; 74 unsigned i; 75 76 for_each_sgtable_dma_sg(sgt, sg, i) 77 shift = min(shift, __ffs(sg_dma_len(sg))); 78 return shift; 79 } 80 81 static int io_populate_area_dma(struct io_zcrx_ifq *ifq, 82 struct io_zcrx_area *area) 83 { 84 unsigned niov_size = 1U << ifq->niov_shift; 85 struct sg_table *sgt = area->mem.sgt; 86 struct scatterlist *sg; 87 unsigned i, niov_idx = 0; 88 89 for_each_sgtable_dma_sg(sgt, sg, i) { 90 dma_addr_t dma = sg_dma_address(sg); 91 unsigned long sg_len = sg_dma_len(sg); 92 93 if (WARN_ON_ONCE(sg_len % niov_size)) 94 return -EINVAL; 95 96 while (sg_len && niov_idx < area->nia.num_niovs) { 97 struct net_iov *niov = &area->nia.niovs[niov_idx]; 98 99 if (net_mp_niov_set_dma_addr(niov, dma)) 100 return -EFAULT; 101 sg_len -= niov_size; 102 dma += niov_size; 103 niov_idx++; 104 } 105 } 106 107 if (WARN_ON_ONCE(niov_idx != area->nia.num_niovs)) 108 return -EFAULT; 109 return 0; 110 } 111 112 static void io_release_dmabuf(struct io_zcrx_mem *mem) 113 { 114 if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) 115 return; 116 117 if (mem->sgt) 118 dma_buf_unmap_attachment_unlocked(mem->attach, mem->sgt, 119 DMA_FROM_DEVICE); 120 if (mem->attach) 121 dma_buf_detach(mem->dmabuf, mem->attach); 122 if (mem->dmabuf) 123 dma_buf_put(mem->dmabuf); 124 125 mem->sgt = NULL; 126 mem->attach = NULL; 127 mem->dmabuf = NULL; 128 } 129 130 static int io_import_dmabuf(struct io_zcrx_ifq *ifq, 131 struct io_zcrx_mem *mem, 132 struct io_uring_zcrx_area_reg *area_reg) 133 { 134 unsigned long off = (unsigned long)area_reg->addr; 135 unsigned long len = (unsigned long)area_reg->len; 136 unsigned long total_size = 0; 137 struct scatterlist *sg; 138 int dmabuf_fd = area_reg->dmabuf_fd; 139 int i, ret; 140 141 if (!ifq->dev) 142 return -EINVAL; 143 if (off) 144 return -EINVAL; 145 if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) 146 return -EINVAL; 147 148 mem->is_dmabuf = true; 149 mem->dmabuf = dma_buf_get(dmabuf_fd); 150 if (IS_ERR(mem->dmabuf)) { 151 ret = PTR_ERR(mem->dmabuf); 152 mem->dmabuf = NULL; 153 goto err; 154 } 155 156 mem->attach = dma_buf_attach(mem->dmabuf, ifq->dev); 157 if (IS_ERR(mem->attach)) { 158 ret = PTR_ERR(mem->attach); 159 mem->attach = NULL; 160 goto err; 161 } 162 163 mem->sgt = dma_buf_map_attachment_unlocked(mem->attach, DMA_FROM_DEVICE); 164 if (IS_ERR(mem->sgt)) { 165 ret = PTR_ERR(mem->sgt); 166 mem->sgt = NULL; 167 goto err; 168 } 169 170 for_each_sgtable_dma_sg(mem->sgt, sg, i) 171 total_size += sg_dma_len(sg); 172 173 if (total_size != len) { 174 ret = -EINVAL; 175 goto err; 176 } 177 178 mem->size = len; 179 return 0; 180 err: 181 io_release_dmabuf(mem); 182 return ret; 183 } 184 185 static unsigned long io_count_account_pages(struct page **pages, unsigned nr_pages) 186 { 187 struct folio *last_folio = NULL; 188 unsigned long res = 0; 189 int i; 190 191 for (i = 0; i < nr_pages; i++) { 192 struct folio *folio = page_folio(pages[i]); 193 194 if (folio == last_folio) 195 continue; 196 last_folio = folio; 197 res += folio_nr_pages(folio); 198 } 199 return res; 200 } 201 202 static int io_import_umem(struct io_zcrx_ifq *ifq, 203 struct io_zcrx_mem *mem, 204 struct io_uring_zcrx_area_reg *area_reg) 205 { 206 struct page **pages; 207 int nr_pages, ret; 208 bool mapped = false; 209 210 if (area_reg->dmabuf_fd) 211 return -EINVAL; 212 if (!area_reg->addr) 213 return -EFAULT; 214 pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len, 215 &nr_pages); 216 if (IS_ERR(pages)) 217 return PTR_ERR(pages); 218 219 ret = sg_alloc_table_from_pages(&mem->page_sg_table, pages, nr_pages, 220 0, (unsigned long)nr_pages << PAGE_SHIFT, 221 GFP_KERNEL_ACCOUNT); 222 if (ret) 223 goto out_err; 224 225 if (ifq->dev) { 226 ret = dma_map_sgtable(ifq->dev, &mem->page_sg_table, 227 DMA_FROM_DEVICE, IO_DMA_ATTR); 228 if (ret < 0) 229 goto out_err; 230 mapped = true; 231 } 232 233 mem->account_pages = io_count_account_pages(pages, nr_pages); 234 ret = io_account_mem(ifq->user, ifq->mm_account, mem->account_pages); 235 if (ret < 0) { 236 mem->account_pages = 0; 237 goto out_err; 238 } 239 240 mem->sgt = &mem->page_sg_table; 241 mem->pages = pages; 242 mem->nr_folios = nr_pages; 243 mem->size = area_reg->len; 244 return ret; 245 out_err: 246 if (mapped) 247 dma_unmap_sgtable(ifq->dev, &mem->page_sg_table, 248 DMA_FROM_DEVICE, IO_DMA_ATTR); 249 sg_free_table(&mem->page_sg_table); 250 unpin_user_pages(pages, nr_pages); 251 kvfree(pages); 252 return ret; 253 } 254 255 static void io_release_area_mem(struct io_zcrx_mem *mem) 256 { 257 if (mem->is_dmabuf) { 258 io_release_dmabuf(mem); 259 } else if (mem->pages) { 260 unpin_user_pages(mem->pages, mem->nr_folios); 261 sg_free_table(mem->sgt); 262 kvfree(mem->pages); 263 } 264 mem->pages = IO_URING_PTR_POISON; 265 mem->sgt = IO_URING_PTR_POISON; 266 } 267 268 static int io_import_area(struct io_zcrx_ifq *ifq, 269 struct io_zcrx_mem *mem, 270 struct io_uring_zcrx_area_reg *area_reg) 271 { 272 int ret; 273 274 if (area_reg->flags & ~IO_ZCRX_AREA_SUPPORTED_FLAGS) 275 return -EINVAL; 276 if (area_reg->rq_area_token) 277 return -EINVAL; 278 if (area_reg->__resv2[0] || area_reg->__resv2[1]) 279 return -EINVAL; 280 281 ret = io_validate_user_buf_range(area_reg->addr, area_reg->len); 282 if (ret) 283 return ret; 284 if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK) 285 return -EINVAL; 286 287 if (area_reg->flags & IORING_ZCRX_AREA_DMABUF) 288 return io_import_dmabuf(ifq, mem, area_reg); 289 return io_import_umem(ifq, mem, area_reg); 290 } 291 292 static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, 293 struct io_zcrx_area *area) 294 { 295 int i; 296 297 guard(mutex)(&ifq->pp_lock); 298 if (!area->is_mapped) 299 return; 300 area->is_mapped = false; 301 302 if (area->nia.niovs) { 303 for (i = 0; i < area->nia.num_niovs; i++) 304 net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0); 305 } 306 307 if (area->mem.is_dmabuf) { 308 io_release_dmabuf(&area->mem); 309 } else { 310 dma_unmap_sgtable(ifq->dev, &area->mem.page_sg_table, 311 DMA_FROM_DEVICE, IO_DMA_ATTR); 312 } 313 } 314 315 static void zcrx_sync_for_device(struct page_pool *pp, struct io_zcrx_ifq *zcrx, 316 netmem_ref *netmems, unsigned nr) 317 { 318 #if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC) 319 struct device *dev = pp->p.dev; 320 unsigned i, niov_size; 321 dma_addr_t dma_addr; 322 323 if (!dma_dev_need_sync(dev)) 324 return; 325 niov_size = 1U << zcrx->niov_shift; 326 327 for (i = 0; i < nr; i++) { 328 dma_addr = page_pool_get_dma_addr_netmem(netmems[i]); 329 __dma_sync_single_for_device(dev, dma_addr + pp->p.offset, 330 niov_size, pp->p.dma_dir); 331 } 332 #endif 333 } 334 335 #define IO_RQ_MAX_ENTRIES 32768 336 337 #define IO_SKBS_PER_CALL_LIMIT 20 338 339 struct io_zcrx_args { 340 struct io_kiocb *req; 341 struct io_zcrx_ifq *ifq; 342 unsigned nr_skbs; 343 }; 344 345 static const struct memory_provider_ops io_uring_pp_zc_ops; 346 347 static inline atomic_t *io_get_user_counter(struct net_iov *niov) 348 { 349 struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); 350 351 return &area->user_refs[net_iov_idx(niov)]; 352 } 353 354 static bool io_zcrx_put_niov_uref(struct net_iov *niov) 355 { 356 atomic_t *uref = io_get_user_counter(niov); 357 int old; 358 359 old = atomic_read(uref); 360 do { 361 if (unlikely(old == 0)) 362 return false; 363 } while (!atomic_try_cmpxchg(uref, &old, old - 1)); 364 365 return true; 366 } 367 368 static void io_zcrx_get_niov_uref(struct net_iov *niov) 369 { 370 atomic_inc(io_get_user_counter(niov)); 371 } 372 373 static void io_fill_zcrx_offsets(struct io_uring_zcrx_offsets *offsets) 374 { 375 offsets->head = offsetof(struct io_uring, head); 376 offsets->tail = offsetof(struct io_uring, tail); 377 offsets->rqes = ALIGN(sizeof(struct io_uring), L1_CACHE_BYTES); 378 } 379 380 static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx, 381 struct io_zcrx_ifq *ifq, 382 struct io_uring_zcrx_ifq_reg *reg, 383 struct io_uring_region_desc *rd, 384 u32 id) 385 { 386 u64 mmap_offset; 387 size_t off, size; 388 void *ptr; 389 int ret; 390 391 io_fill_zcrx_offsets(®->offsets); 392 off = reg->offsets.rqes; 393 size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries; 394 if (size > rd->size) 395 return -EINVAL; 396 397 mmap_offset = IORING_MAP_OFF_ZCRX_REGION; 398 mmap_offset += (u64)id << IORING_OFF_ZCRX_SHIFT; 399 400 ret = io_create_region(ctx, &ifq->rq_region, rd, mmap_offset); 401 if (ret < 0) 402 return ret; 403 404 ptr = io_region_get_ptr(&ifq->rq_region); 405 ifq->rq.ring = (struct io_uring *)ptr; 406 ifq->rq.rqes = (struct io_uring_zcrx_rqe *)(ptr + off); 407 408 memset(ifq->rq.ring, 0, sizeof(*ifq->rq.ring)); 409 return 0; 410 } 411 412 static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) 413 { 414 io_free_region(ifq->user, &ifq->rq_region); 415 ifq->rq.ring = IO_URING_PTR_POISON; 416 ifq->rq.rqes = IO_URING_PTR_POISON; 417 ifq->notif_stats = IO_URING_PTR_POISON; 418 } 419 420 static void io_zcrx_free_area(struct io_zcrx_ifq *ifq, 421 struct io_zcrx_area *area) 422 { 423 io_zcrx_unmap_area(ifq, area); 424 io_release_area_mem(&area->mem); 425 426 if (area->mem.account_pages) 427 io_unaccount_mem(ifq->user, ifq->mm_account, 428 area->mem.account_pages); 429 430 kvfree(area->freelist); 431 kvfree(area->nia.niovs); 432 kvfree(area->user_refs); 433 kfree(area); 434 } 435 436 static int io_zcrx_append_area(struct io_zcrx_ifq *ifq, 437 struct io_zcrx_area *area) 438 { 439 bool kern_readable = !area->mem.is_dmabuf; 440 441 if (WARN_ON_ONCE(ifq->area)) 442 return -EINVAL; 443 if (WARN_ON_ONCE(ifq->kern_readable != kern_readable)) 444 return -EINVAL; 445 446 ifq->area = area; 447 return 0; 448 } 449 450 static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, 451 struct io_uring_zcrx_area_reg *area_reg, 452 struct io_uring_zcrx_ifq_reg *reg) 453 { 454 int buf_size_shift = PAGE_SHIFT; 455 struct io_zcrx_area *area; 456 unsigned nr_iovs; 457 int i, ret; 458 459 if (reg->rx_buf_len) { 460 if (!is_power_of_2(reg->rx_buf_len) || 461 reg->rx_buf_len < PAGE_SIZE) 462 return -EINVAL; 463 buf_size_shift = ilog2(reg->rx_buf_len); 464 } 465 if (!ifq->dev && buf_size_shift != PAGE_SHIFT) 466 return -EOPNOTSUPP; 467 468 ret = -ENOMEM; 469 area = kzalloc_obj(*area); 470 if (!area) 471 goto err; 472 area->ifq = ifq; 473 474 ret = io_import_area(ifq, &area->mem, area_reg); 475 if (ret) 476 goto err; 477 if (ifq->dev) 478 area->is_mapped = true; 479 480 if (ifq->dev && buf_size_shift > io_area_max_shift(&area->mem)) { 481 ret = -ERANGE; 482 goto err; 483 } 484 485 ifq->niov_shift = buf_size_shift; 486 nr_iovs = area->mem.size >> ifq->niov_shift; 487 area->nia.num_niovs = nr_iovs; 488 489 ret = -ENOMEM; 490 area->nia.niovs = kvmalloc_objs(area->nia.niovs[0], nr_iovs, 491 GFP_KERNEL_ACCOUNT | __GFP_ZERO); 492 if (!area->nia.niovs) 493 goto err; 494 495 area->freelist = kvmalloc_array(nr_iovs, sizeof(area->freelist[0]), 496 GFP_KERNEL_ACCOUNT | __GFP_ZERO); 497 if (!area->freelist) 498 goto err; 499 500 area->user_refs = kvmalloc_objs(area->user_refs[0], nr_iovs, 501 GFP_KERNEL_ACCOUNT | __GFP_ZERO); 502 if (!area->user_refs) 503 goto err; 504 505 for (i = 0; i < nr_iovs; i++) { 506 struct net_iov *niov = &area->nia.niovs[i]; 507 508 net_iov_init(niov, &area->nia, NET_IOV_IOURING); 509 area->freelist[i] = i; 510 atomic_set(&area->user_refs[i], 0); 511 } 512 513 if (ifq->dev) { 514 ret = io_populate_area_dma(ifq, area); 515 if (ret) 516 goto err; 517 } 518 519 area->free_count = nr_iovs; 520 /* we're only supporting one area per ifq for now */ 521 area->area_id = 0; 522 area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT; 523 spin_lock_init(&area->freelist_lock); 524 525 ret = io_zcrx_append_area(ifq, area); 526 if (!ret) 527 return 0; 528 err: 529 if (area) 530 io_zcrx_free_area(ifq, area); 531 return ret; 532 } 533 534 static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx) 535 { 536 struct io_zcrx_ifq *ifq; 537 538 ifq = kzalloc_obj(*ifq); 539 if (!ifq) 540 return NULL; 541 542 ifq->if_rxq = -1; 543 spin_lock_init(&ifq->ctx_lock); 544 spin_lock_init(&ifq->rq.lock); 545 mutex_init(&ifq->pp_lock); 546 refcount_set(&ifq->refs, 1); 547 refcount_set(&ifq->user_refs, 1); 548 return ifq; 549 } 550 551 static void io_zcrx_drop_netdev(struct io_zcrx_ifq *ifq) 552 { 553 guard(mutex)(&ifq->pp_lock); 554 555 if (!ifq->netdev) 556 return; 557 netdev_put(ifq->netdev, &ifq->netdev_tracker); 558 ifq->netdev = NULL; 559 } 560 561 static void io_close_queue(struct io_zcrx_ifq *ifq) 562 { 563 struct net_device *netdev; 564 netdevice_tracker netdev_tracker; 565 struct pp_memory_provider_params p = { 566 .mp_ops = &io_uring_pp_zc_ops, 567 .mp_priv = ifq, 568 }; 569 570 scoped_guard(mutex, &ifq->pp_lock) { 571 netdev = ifq->netdev; 572 netdev_tracker = ifq->netdev_tracker; 573 ifq->netdev = NULL; 574 } 575 576 if (netdev) { 577 if (ifq->if_rxq != -1) { 578 netdev_lock(netdev); 579 netif_mp_close_rxq(netdev, ifq->if_rxq, &p); 580 netdev_unlock(netdev); 581 } 582 netdev_put(netdev, &netdev_tracker); 583 } 584 ifq->if_rxq = -1; 585 } 586 587 static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) 588 { 589 if (WARN_ON_ONCE(ifq->if_rxq != -1)) 590 return; 591 if (WARN_ON_ONCE(ifq->netdev != NULL)) 592 return; 593 if (WARN_ON_ONCE(ifq->master_ctx)) 594 return; 595 596 if (ifq->area) 597 io_zcrx_free_area(ifq, ifq->area); 598 if (ifq->mm_account) 599 mmdrop(ifq->mm_account); 600 if (ifq->dev) 601 put_device(ifq->dev); 602 603 io_free_rbuf_ring(ifq); 604 free_uid(ifq->user); 605 mutex_destroy(&ifq->pp_lock); 606 kfree(ifq); 607 } 608 609 static void io_put_zcrx_ifq(struct io_zcrx_ifq *ifq) 610 { 611 if (refcount_dec_and_test(&ifq->refs)) 612 io_zcrx_ifq_free(ifq); 613 } 614 615 static void io_zcrx_return_niov_freelist(struct net_iov *niov) 616 { 617 struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); 618 619 guard(spinlock_bh)(&area->freelist_lock); 620 if (WARN_ON_ONCE(area->free_count >= area->nia.num_niovs)) 621 return; 622 area->freelist[area->free_count++] = net_iov_idx(niov); 623 } 624 625 static struct net_iov *zcrx_get_free_niov(struct io_zcrx_area *area) 626 { 627 unsigned niov_idx; 628 629 lockdep_assert_held(&area->freelist_lock); 630 631 if (unlikely(!area->free_count)) 632 return NULL; 633 634 niov_idx = area->freelist[--area->free_count]; 635 return &area->nia.niovs[niov_idx]; 636 } 637 638 static void io_zcrx_return_niov(struct net_iov *niov) 639 { 640 netmem_ref netmem = net_iov_to_netmem(niov); 641 642 if (!niov->desc.pp) { 643 /* copy fallback allocated niovs */ 644 io_zcrx_return_niov_freelist(niov); 645 return; 646 } 647 page_pool_put_unrefed_netmem(niov->desc.pp, netmem, -1, false); 648 } 649 650 static void io_zcrx_scrub(struct io_zcrx_ifq *ifq) 651 { 652 struct io_zcrx_area *area = ifq->area; 653 int i; 654 655 if (!area) 656 return; 657 658 /* Reclaim back all buffers given to the user space. */ 659 for (i = 0; i < area->nia.num_niovs; i++) { 660 struct net_iov *niov = &area->nia.niovs[i]; 661 int nr; 662 663 if (!atomic_read(io_get_user_counter(niov))) 664 continue; 665 nr = atomic_xchg(io_get_user_counter(niov), 0); 666 if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr)) 667 io_zcrx_return_niov(niov); 668 } 669 } 670 671 static void zcrx_unregister_user(struct io_zcrx_ifq *ifq, struct io_ring_ctx *ctx) 672 { 673 scoped_guard(spinlock_bh, &ifq->ctx_lock) { 674 if (ctx && ifq->master_ctx == ctx) { 675 ifq->master_ctx = NULL; 676 percpu_ref_put(&ctx->refs); 677 } 678 } 679 680 if (refcount_dec_and_test(&ifq->user_refs)) { 681 io_close_queue(ifq); 682 io_zcrx_scrub(ifq); 683 } 684 } 685 686 static void zcrx_unregister(struct io_zcrx_ifq *ifq, struct io_ring_ctx *ctx) 687 { 688 zcrx_unregister_user(ifq, ctx); 689 io_put_zcrx_ifq(ifq); 690 } 691 692 struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, 693 unsigned int id) 694 { 695 struct io_zcrx_ifq *ifq = xa_load(&ctx->zcrx_ctxs, id); 696 697 lockdep_assert_held(&ctx->mmap_lock); 698 699 return ifq ? &ifq->rq_region : NULL; 700 } 701 702 static int zcrx_box_release(struct inode *inode, struct file *file) 703 { 704 struct io_zcrx_ifq *ifq = file->private_data; 705 706 if (WARN_ON_ONCE(!ifq)) 707 return -EFAULT; 708 zcrx_unregister(ifq, NULL); 709 return 0; 710 } 711 712 static const struct file_operations zcrx_box_fops = { 713 .owner = THIS_MODULE, 714 .release = zcrx_box_release, 715 }; 716 717 static int zcrx_export(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq, 718 struct zcrx_ctrl *ctrl, void __user *arg) 719 { 720 struct zcrx_ctrl_export *ce = &ctrl->zc_export; 721 struct file *file; 722 int fd; 723 724 if (!mem_is_zero(ce, sizeof(*ce))) 725 return -EINVAL; 726 727 refcount_inc(&ifq->refs); 728 refcount_inc(&ifq->user_refs); 729 730 file = anon_inode_create_getfile("[zcrx]", &zcrx_box_fops, 731 ifq, O_CLOEXEC, NULL); 732 if (IS_ERR(file)) { 733 zcrx_unregister(ifq, NULL); 734 return PTR_ERR(file); 735 } 736 737 fd = get_unused_fd_flags(O_CLOEXEC); 738 if (fd < 0) { 739 fput(file); 740 return fd; 741 } 742 743 ce->zcrx_fd = fd; 744 if (copy_to_user(arg, ctrl, sizeof(*ctrl))) { 745 fput(file); 746 put_unused_fd(fd); 747 return -EFAULT; 748 } 749 750 fd_install(fd, file); 751 return 0; 752 } 753 754 static int import_zcrx(struct io_ring_ctx *ctx, 755 struct io_uring_zcrx_ifq_reg __user *arg, 756 struct io_uring_zcrx_ifq_reg *reg) 757 { 758 struct io_zcrx_ifq *ifq; 759 struct file *file; 760 int fd, ret; 761 u32 id; 762 763 if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) 764 return -EINVAL; 765 if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))) 766 return -EINVAL; 767 if (reg->if_rxq || reg->rq_entries || reg->area_ptr || reg->region_ptr) 768 return -EINVAL; 769 if (reg->notif_desc) 770 return -EINVAL; 771 if (reg->flags & ~ZCRX_REG_IMPORT) 772 return -EINVAL; 773 774 fd = reg->if_idx; 775 CLASS(fd, f)(fd); 776 if (fd_empty(f)) 777 return -EBADF; 778 779 file = fd_file(f); 780 if (file->f_op != &zcrx_box_fops || !file->private_data) 781 return -EBADF; 782 783 ifq = file->private_data; 784 refcount_inc(&ifq->refs); 785 refcount_inc(&ifq->user_refs); 786 787 scoped_guard(mutex, &ctx->mmap_lock) { 788 ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL); 789 if (ret) 790 goto err; 791 } 792 793 reg->zcrx_id = id; 794 io_fill_zcrx_offsets(®->offsets); 795 if (copy_to_user(arg, reg, sizeof(*reg))) { 796 ret = -EFAULT; 797 goto err_xa_erase; 798 } 799 800 scoped_guard(mutex, &ctx->mmap_lock) { 801 ret = -ENOMEM; 802 if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL)) 803 goto err_xa_erase; 804 } 805 806 return 0; 807 err_xa_erase: 808 scoped_guard(mutex, &ctx->mmap_lock) 809 xa_erase(&ctx->zcrx_ctxs, id); 810 err: 811 zcrx_unregister(ifq, ctx); 812 return ret; 813 } 814 815 static int zcrx_register_netdev(struct io_zcrx_ifq *ifq, 816 struct io_uring_zcrx_ifq_reg *reg, 817 struct io_uring_zcrx_area_reg *area) 818 { 819 struct pp_memory_provider_params mp_param = {}; 820 unsigned if_rxq = reg->if_rxq; 821 int ret; 822 823 ifq->netdev = netdev_get_by_index_lock(current->nsproxy->net_ns, 824 reg->if_idx); 825 if (!ifq->netdev) 826 return -ENODEV; 827 828 netdev_hold(ifq->netdev, &ifq->netdev_tracker, GFP_KERNEL); 829 830 ifq->dev = netdev_queue_get_dma_dev(ifq->netdev, if_rxq, NETDEV_QUEUE_TYPE_RX); 831 if (!ifq->dev) { 832 ret = -EOPNOTSUPP; 833 goto netdev_put_unlock; 834 } 835 get_device(ifq->dev); 836 837 ret = io_zcrx_create_area(ifq, area, reg); 838 if (ret) 839 goto netdev_put_unlock; 840 841 if (reg->rx_buf_len) 842 mp_param.rx_page_size = 1U << ifq->niov_shift; 843 mp_param.mp_ops = &io_uring_pp_zc_ops; 844 mp_param.mp_priv = ifq; 845 ret = netif_mp_open_rxq(ifq->netdev, if_rxq, &mp_param, NULL); 846 if (ret) 847 goto netdev_put_unlock; 848 849 ifq->if_rxq = if_rxq; 850 ret = 0; 851 netdev_put_unlock: 852 netdev_unlock(ifq->netdev); 853 return ret; 854 } 855 856 static int zcrx_validate_notif_stats(struct io_zcrx_ifq *ifq, 857 const struct io_uring_zcrx_ifq_reg *reg, 858 const struct zcrx_notification_desc *notif) 859 { 860 size_t stats_off = notif->stats_offset; 861 size_t used, end; 862 863 used = reg->offsets.rqes + 864 sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries; 865 866 if (!IS_ALIGNED(stats_off, __alignof__(struct zcrx_notif_stats))) 867 return -EINVAL; 868 if (stats_off < used) 869 return -ERANGE; 870 if (check_add_overflow(stats_off, 871 sizeof(struct zcrx_notif_stats), 872 &end)) 873 return -ERANGE; 874 if (end > io_region_size(&ifq->rq_region)) 875 return -ERANGE; 876 877 ifq->notif_stats = io_region_get_ptr(&ifq->rq_region) + stats_off; 878 memset(ifq->notif_stats, 0, sizeof(*ifq->notif_stats)); 879 880 return 0; 881 } 882 883 int io_register_zcrx(struct io_ring_ctx *ctx, 884 struct io_uring_zcrx_ifq_reg __user *arg) 885 { 886 struct zcrx_notification_desc notif; 887 struct io_uring_zcrx_area_reg area; 888 struct io_uring_zcrx_ifq_reg reg; 889 struct io_uring_region_desc rd; 890 struct io_zcrx_ifq *ifq; 891 int ret; 892 u32 id; 893 894 /* 895 * 1. Interface queue allocation. 896 * 2. It can observe data destined for sockets of other tasks. 897 */ 898 if (!capable(CAP_NET_ADMIN)) 899 return -EPERM; 900 901 /* mandatory io_uring features for zc rx */ 902 if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) 903 return -EINVAL; 904 if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))) 905 return -EINVAL; 906 if (copy_from_user(®, arg, sizeof(reg))) 907 return -EFAULT; 908 if (!mem_is_zero(®.__resv, sizeof(reg.__resv)) || reg.zcrx_id) 909 return -EINVAL; 910 if (reg.flags & ~ZCRX_SUPPORTED_REG_FLAGS) 911 return -EINVAL; 912 if (reg.flags & ZCRX_REG_IMPORT) 913 return import_zcrx(ctx, arg, ®); 914 if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd))) 915 return -EFAULT; 916 if (reg.if_rxq == -1 || !reg.rq_entries) 917 return -EINVAL; 918 if ((reg.if_rxq || reg.if_idx) && (reg.flags & ZCRX_REG_NODEV)) 919 return -EINVAL; 920 if (reg.rq_entries > IO_RQ_MAX_ENTRIES) { 921 if (!(ctx->flags & IORING_SETUP_CLAMP)) 922 return -EINVAL; 923 reg.rq_entries = IO_RQ_MAX_ENTRIES; 924 } 925 reg.rq_entries = roundup_pow_of_two(reg.rq_entries); 926 927 if (copy_from_user(&area, u64_to_user_ptr(reg.area_ptr), sizeof(area))) 928 return -EFAULT; 929 930 memset(¬if, 0, sizeof(notif)); 931 if (reg.notif_desc && copy_from_user(¬if, u64_to_user_ptr(reg.notif_desc), 932 sizeof(notif))) 933 return -EFAULT; 934 if (notif.type_mask & ~ZCRX_NOTIF_TYPE_MASK) 935 return -EINVAL; 936 if (notif.flags & ~ZCRX_NOTIF_DESC_FLAG_STATS) 937 return -EINVAL; 938 if (!(notif.flags & ZCRX_NOTIF_DESC_FLAG_STATS)) { 939 if (notif.stats_offset) 940 return -EINVAL; 941 } 942 if (!mem_is_zero(¬if.__resv2, sizeof(notif.__resv2))) 943 return -EINVAL; 944 945 ifq = io_zcrx_ifq_alloc(ctx); 946 if (!ifq) 947 return -ENOMEM; 948 949 ifq->notif_data = notif.user_data; 950 ifq->allowed_notif_mask = notif.type_mask; 951 952 if (ctx->user) { 953 get_uid(ctx->user); 954 ifq->user = ctx->user; 955 } 956 if (ctx->mm_account) { 957 mmgrab(ctx->mm_account); 958 ifq->mm_account = ctx->mm_account; 959 } 960 ifq->rq.nr_entries = reg.rq_entries; 961 962 scoped_guard(mutex, &ctx->mmap_lock) { 963 /* preallocate id */ 964 ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL); 965 if (ret) 966 goto ifq_free; 967 } 968 969 ret = io_allocate_rbuf_ring(ctx, ifq, ®, &rd, id); 970 if (ret) 971 goto err; 972 973 if (notif.flags & ZCRX_NOTIF_DESC_FLAG_STATS) { 974 ret = zcrx_validate_notif_stats(ifq, ®, ¬if); 975 if (ret) 976 goto err; 977 } 978 979 ifq->kern_readable = !(area.flags & IORING_ZCRX_AREA_DMABUF); 980 981 if (!(reg.flags & ZCRX_REG_NODEV)) { 982 ret = zcrx_register_netdev(ifq, ®, &area); 983 if (ret) 984 goto err; 985 } else { 986 ret = io_zcrx_create_area(ifq, &area, ®); 987 if (ret) 988 goto err; 989 } 990 991 reg.zcrx_id = id; 992 993 scoped_guard(mutex, &ctx->mmap_lock) { 994 /* publish ifq */ 995 ret = -ENOMEM; 996 if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL)) 997 goto err; 998 } 999 1000 reg.rx_buf_len = 1U << ifq->niov_shift; 1001 1002 if (copy_to_user(arg, ®, sizeof(reg)) || 1003 copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) || 1004 copy_to_user(u64_to_user_ptr(reg.area_ptr), &area, sizeof(area))) { 1005 ret = -EFAULT; 1006 goto err; 1007 } 1008 1009 if (notif.type_mask) 1010 zcrx_set_ring_ctx(ifq, ctx); 1011 return 0; 1012 err: 1013 scoped_guard(mutex, &ctx->mmap_lock) 1014 xa_erase(&ctx->zcrx_ctxs, id); 1015 ifq_free: 1016 zcrx_unregister(ifq, ctx); 1017 return ret; 1018 } 1019 1020 static inline bool is_zcrx_entry_marked(struct io_ring_ctx *ctx, unsigned long id) 1021 { 1022 return xa_get_mark(&ctx->zcrx_ctxs, id, XA_MARK_1); 1023 } 1024 1025 static inline void set_zcrx_entry_mark(struct io_ring_ctx *ctx, unsigned long id) 1026 { 1027 xa_set_mark(&ctx->zcrx_ctxs, id, XA_MARK_1); 1028 } 1029 1030 void io_terminate_zcrx(struct io_ring_ctx *ctx) 1031 { 1032 struct io_zcrx_ifq *ifq; 1033 unsigned long id = 0; 1034 1035 lockdep_assert_held(&ctx->uring_lock); 1036 1037 while (1) { 1038 scoped_guard(mutex, &ctx->mmap_lock) 1039 ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT); 1040 if (!ifq) 1041 break; 1042 if (WARN_ON_ONCE(is_zcrx_entry_marked(ctx, id))) 1043 break; 1044 set_zcrx_entry_mark(ctx, id); 1045 id++; 1046 zcrx_unregister_user(ifq, ctx); 1047 } 1048 } 1049 1050 void io_unregister_zcrx(struct io_ring_ctx *ctx) 1051 { 1052 struct io_zcrx_ifq *ifq; 1053 1054 lockdep_assert_held(&ctx->uring_lock); 1055 1056 while (1) { 1057 scoped_guard(mutex, &ctx->mmap_lock) { 1058 unsigned long id = 0; 1059 1060 ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT); 1061 if (ifq) { 1062 if (WARN_ON_ONCE(!is_zcrx_entry_marked(ctx, id))) { 1063 ifq = NULL; 1064 break; 1065 } 1066 xa_erase(&ctx->zcrx_ctxs, id); 1067 } 1068 } 1069 if (!ifq) 1070 break; 1071 /* 1072 * io_uring can run requests and return buffers to the user 1073 * after termination, scrub it again. 1074 */ 1075 if (refcount_read(&ifq->user_refs) == 0) 1076 io_zcrx_scrub(ifq); 1077 io_put_zcrx_ifq(ifq); 1078 } 1079 1080 xa_destroy(&ctx->zcrx_ctxs); 1081 } 1082 1083 static inline u32 zcrx_rq_entries(struct zcrx_rq *rq) 1084 { 1085 u32 entries; 1086 1087 entries = smp_load_acquire(&rq->ring->tail) - rq->cached_head; 1088 return min(entries, rq->nr_entries); 1089 } 1090 1091 static struct io_uring_zcrx_rqe *zcrx_next_rqe(struct zcrx_rq *rq, unsigned mask) 1092 { 1093 unsigned int idx = rq->cached_head++ & mask; 1094 1095 return &rq->rqes[idx]; 1096 } 1097 1098 static inline bool io_parse_rqe(struct io_uring_zcrx_rqe *rqe, 1099 struct io_zcrx_ifq *ifq, 1100 struct net_iov **ret_niov) 1101 { 1102 __u64 off = READ_ONCE(rqe->off); 1103 unsigned niov_idx, area_idx; 1104 struct io_zcrx_area *area; 1105 1106 area_idx = off >> IORING_ZCRX_AREA_SHIFT; 1107 niov_idx = (off & ~IORING_ZCRX_AREA_MASK) >> ifq->niov_shift; 1108 1109 if (unlikely(rqe->__pad || area_idx)) 1110 return false; 1111 area = ifq->area; 1112 1113 if (unlikely(niov_idx >= area->nia.num_niovs)) 1114 return false; 1115 niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs); 1116 1117 *ret_niov = &area->nia.niovs[niov_idx]; 1118 return true; 1119 } 1120 1121 static unsigned io_zcrx_ring_refill(struct page_pool *pp, 1122 struct io_zcrx_ifq *ifq, 1123 netmem_ref *netmems, unsigned to_alloc) 1124 { 1125 struct zcrx_rq *rq = &ifq->rq; 1126 unsigned int mask = rq->nr_entries - 1; 1127 unsigned int entries; 1128 unsigned allocated = 0; 1129 1130 guard(spinlock_bh)(&rq->lock); 1131 1132 entries = zcrx_rq_entries(rq); 1133 entries = min_t(unsigned, entries, to_alloc); 1134 if (unlikely(!entries)) 1135 return 0; 1136 1137 do { 1138 struct io_uring_zcrx_rqe *rqe = zcrx_next_rqe(rq, mask); 1139 struct net_iov *niov; 1140 netmem_ref netmem; 1141 1142 if (!io_parse_rqe(rqe, ifq, &niov)) 1143 continue; 1144 if (!io_zcrx_put_niov_uref(niov)) 1145 continue; 1146 1147 netmem = net_iov_to_netmem(niov); 1148 if (!page_pool_unref_and_test(netmem)) 1149 continue; 1150 1151 if (unlikely(niov->desc.pp != pp)) { 1152 io_zcrx_return_niov(niov); 1153 continue; 1154 } 1155 1156 netmems[allocated] = netmem; 1157 allocated++; 1158 } while (--entries); 1159 1160 smp_store_release(&rq->ring->head, rq->cached_head); 1161 return allocated; 1162 } 1163 1164 static unsigned io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq, 1165 netmem_ref *netmems, unsigned to_alloc) 1166 { 1167 struct io_zcrx_area *area = ifq->area; 1168 unsigned allocated = 0; 1169 1170 guard(spinlock_bh)(&area->freelist_lock); 1171 1172 for (allocated = 0; allocated < to_alloc; allocated++) { 1173 struct net_iov *niov = zcrx_get_free_niov(area); 1174 1175 if (!niov) 1176 break; 1177 net_mp_niov_set_page_pool(pp, niov); 1178 netmems[allocated] = net_iov_to_netmem(niov); 1179 } 1180 return allocated; 1181 } 1182 1183 static void zcrx_notif_tw(struct io_tw_req tw_req, io_tw_token_t tw) 1184 { 1185 struct io_kiocb *req = tw_req.req; 1186 struct io_ring_ctx *ctx = req->ctx; 1187 1188 io_post_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, 0); 1189 percpu_ref_put(&ctx->refs); 1190 io_poison_req(req); 1191 kmem_cache_free(req_cachep, req); 1192 } 1193 1194 static void zcrx_stat_add(__u64 *p, s64 v) 1195 { 1196 WRITE_ONCE(*p, READ_ONCE(*p) + v); 1197 } 1198 1199 static void zcrx_send_notif(struct io_zcrx_ifq *ifq, unsigned type) 1200 { 1201 gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN | __GFP_ZERO; 1202 u32 type_mask = 1 << type; 1203 struct io_kiocb *req; 1204 1205 if (!(type_mask & ifq->allowed_notif_mask)) 1206 return; 1207 1208 guard(spinlock_bh)(&ifq->ctx_lock); 1209 if (!ifq->master_ctx) 1210 return; 1211 if (type_mask & ifq->fired_notifs) 1212 return; 1213 1214 req = kmem_cache_alloc(req_cachep, gfp); 1215 if (unlikely(!req)) 1216 return; 1217 1218 ifq->fired_notifs |= type_mask; 1219 1220 req->opcode = IORING_OP_NOP; 1221 req->cqe.user_data = ifq->notif_data; 1222 req->cqe.res = type; 1223 req->ctx = ifq->master_ctx; 1224 percpu_ref_get(&req->ctx->refs); 1225 req->tctx = NULL; 1226 req->io_task_work.func = zcrx_notif_tw; 1227 io_req_task_work_add(req); 1228 } 1229 1230 static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp) 1231 { 1232 struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp); 1233 netmem_ref *netmems = pp->alloc.cache; 1234 unsigned to_alloc = PP_ALLOC_CACHE_REFILL; 1235 unsigned allocated; 1236 1237 /* pp should already be ensuring that */ 1238 if (WARN_ON_ONCE(pp->alloc.count)) 1239 return 0; 1240 1241 allocated = io_zcrx_ring_refill(pp, ifq, netmems, to_alloc); 1242 if (likely(allocated)) 1243 goto out_return; 1244 1245 allocated = io_zcrx_refill_slow(pp, ifq, netmems, to_alloc); 1246 if (!allocated) { 1247 zcrx_send_notif(ifq, ZCRX_NOTIF_NO_BUFFERS); 1248 return 0; 1249 } 1250 out_return: 1251 zcrx_sync_for_device(pp, ifq, netmems, allocated); 1252 allocated--; 1253 pp->alloc.count += allocated; 1254 return netmems[allocated]; 1255 } 1256 1257 static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem) 1258 { 1259 struct net_iov *niov; 1260 1261 if (WARN_ON_ONCE(!netmem_is_net_iov(netmem))) 1262 return false; 1263 1264 niov = netmem_to_net_iov(netmem); 1265 net_mp_niov_clear_page_pool(niov); 1266 io_zcrx_return_niov_freelist(niov); 1267 return false; 1268 } 1269 1270 static int io_pp_zc_init(struct page_pool *pp) 1271 { 1272 struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp); 1273 1274 if (WARN_ON_ONCE(!ifq)) 1275 return -EINVAL; 1276 if (WARN_ON_ONCE(ifq->dev != pp->p.dev)) 1277 return -EINVAL; 1278 if (WARN_ON_ONCE(!pp->dma_map)) 1279 return -EOPNOTSUPP; 1280 if (pp->p.order + PAGE_SHIFT != ifq->niov_shift) 1281 return -EINVAL; 1282 if (pp->p.dma_dir != DMA_FROM_DEVICE) 1283 return -EOPNOTSUPP; 1284 1285 refcount_inc(&ifq->refs); 1286 return 0; 1287 } 1288 1289 static void io_pp_zc_destroy(struct page_pool *pp) 1290 { 1291 io_put_zcrx_ifq(io_pp_to_ifq(pp)); 1292 } 1293 1294 static int io_pp_nl_fill(void *mp_priv, struct sk_buff *rsp, 1295 struct netdev_rx_queue *rxq) 1296 { 1297 struct io_zcrx_ifq *ifq = mp_priv; 1298 struct nlattr *nest; 1299 int type; 1300 1301 type = rxq ? NETDEV_A_QUEUE_IO_URING : NETDEV_A_PAGE_POOL_IO_URING; 1302 nest = nla_nest_start(rsp, type); 1303 if (!nest) 1304 return -EMSGSIZE; 1305 1306 if (nla_put_uint(rsp, NETDEV_A_IO_URING_PROVIDER_INFO_RX_BUF_LEN, 1307 1ULL << ifq->niov_shift)) { 1308 nla_nest_cancel(rsp, nest); 1309 return -EMSGSIZE; 1310 } 1311 1312 nla_nest_end(rsp, nest); 1313 1314 return 0; 1315 } 1316 1317 static void io_pp_uninstall(void *mp_priv, struct netdev_rx_queue *rxq) 1318 { 1319 struct pp_memory_provider_params *p = &rxq->mp_params; 1320 struct io_zcrx_ifq *ifq = mp_priv; 1321 1322 io_zcrx_drop_netdev(ifq); 1323 if (ifq->area) 1324 io_zcrx_unmap_area(ifq, ifq->area); 1325 1326 p->mp_ops = NULL; 1327 p->mp_priv = NULL; 1328 } 1329 1330 static const struct memory_provider_ops io_uring_pp_zc_ops = { 1331 .alloc_netmems = io_pp_zc_alloc_netmems, 1332 .release_netmem = io_pp_zc_release_netmem, 1333 .init = io_pp_zc_init, 1334 .destroy = io_pp_zc_destroy, 1335 .nl_fill = io_pp_nl_fill, 1336 .uninstall = io_pp_uninstall, 1337 }; 1338 1339 static unsigned zcrx_parse_rq(netmem_ref *netmem_array, unsigned nr, 1340 struct io_zcrx_ifq *zcrx, struct zcrx_rq *rq) 1341 { 1342 unsigned int mask = rq->nr_entries - 1; 1343 unsigned int i; 1344 1345 nr = min(nr, zcrx_rq_entries(rq)); 1346 for (i = 0; i < nr; i++) { 1347 struct io_uring_zcrx_rqe *rqe = zcrx_next_rqe(rq, mask); 1348 struct net_iov *niov; 1349 1350 if (!io_parse_rqe(rqe, zcrx, &niov)) 1351 break; 1352 netmem_array[i] = net_iov_to_netmem(niov); 1353 } 1354 1355 smp_store_release(&rq->ring->head, rq->cached_head); 1356 return i; 1357 } 1358 1359 #define ZCRX_FLUSH_BATCH 32 1360 1361 static void zcrx_return_buffers(netmem_ref *netmems, unsigned nr) 1362 { 1363 unsigned i; 1364 1365 for (i = 0; i < nr; i++) { 1366 netmem_ref netmem = netmems[i]; 1367 struct net_iov *niov = netmem_to_net_iov(netmem); 1368 1369 if (!io_zcrx_put_niov_uref(niov)) 1370 continue; 1371 if (!page_pool_unref_and_test(netmem)) 1372 continue; 1373 io_zcrx_return_niov(niov); 1374 } 1375 } 1376 1377 static int zcrx_flush_rq(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx, 1378 struct zcrx_ctrl *ctrl) 1379 { 1380 struct zcrx_ctrl_flush_rq *frq = &ctrl->zc_flush; 1381 netmem_ref netmems[ZCRX_FLUSH_BATCH]; 1382 unsigned total = 0; 1383 unsigned nr; 1384 1385 if (!mem_is_zero(&frq->__resv, sizeof(frq->__resv))) 1386 return -EINVAL; 1387 1388 do { 1389 struct zcrx_rq *rq = &zcrx->rq; 1390 1391 scoped_guard(spinlock_bh, &rq->lock) { 1392 nr = zcrx_parse_rq(netmems, ZCRX_FLUSH_BATCH, zcrx, rq); 1393 zcrx_return_buffers(netmems, nr); 1394 } 1395 1396 total += nr; 1397 1398 if (fatal_signal_pending(current)) 1399 break; 1400 cond_resched(); 1401 } while (nr == ZCRX_FLUSH_BATCH && total < zcrx->rq.nr_entries); 1402 1403 return 0; 1404 } 1405 1406 static int zcrx_arm_notif(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx, 1407 struct zcrx_ctrl *ctrl) 1408 { 1409 const struct zcrx_ctrl_arm_notif *an = &ctrl->zc_arm_notif; 1410 unsigned type_mask; 1411 1412 if (an->notif_type >= __ZCRX_NOTIF_TYPE_LAST) 1413 return -EINVAL; 1414 if (!mem_is_zero(&an->__resv, sizeof(an->__resv))) 1415 return -EINVAL; 1416 1417 guard(spinlock_bh)(&zcrx->ctx_lock); 1418 type_mask = 1U << an->notif_type; 1419 if (type_mask & ~zcrx->fired_notifs) 1420 return -EINVAL; 1421 zcrx->fired_notifs &= ~type_mask; 1422 return 0; 1423 } 1424 1425 int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) 1426 { 1427 struct zcrx_ctrl ctrl; 1428 struct io_zcrx_ifq *zcrx; 1429 1430 BUILD_BUG_ON(sizeof(ctrl.zc_export) != sizeof(ctrl.zc_flush)); 1431 BUILD_BUG_ON(sizeof(ctrl.zc_export) != sizeof(ctrl.zc_arm_notif)); 1432 1433 if (nr_args) 1434 return -EINVAL; 1435 if (copy_from_user(&ctrl, arg, sizeof(ctrl))) 1436 return -EFAULT; 1437 if (!mem_is_zero(&ctrl.__resv, sizeof(ctrl.__resv))) 1438 return -EFAULT; 1439 1440 zcrx = xa_load(&ctx->zcrx_ctxs, ctrl.zcrx_id); 1441 if (!zcrx) 1442 return -ENXIO; 1443 1444 switch (ctrl.op) { 1445 case ZCRX_CTRL_FLUSH_RQ: 1446 return zcrx_flush_rq(ctx, zcrx, &ctrl); 1447 case ZCRX_CTRL_EXPORT: 1448 return zcrx_export(ctx, zcrx, &ctrl, arg); 1449 case ZCRX_CTRL_ARM_NOTIFICATION: 1450 return zcrx_arm_notif(ctx, zcrx, &ctrl); 1451 } 1452 1453 return -EOPNOTSUPP; 1454 } 1455 1456 static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov, 1457 struct io_zcrx_ifq *ifq, int off, int len) 1458 { 1459 struct io_ring_ctx *ctx = req->ctx; 1460 struct io_uring_zcrx_cqe *rcqe; 1461 struct io_zcrx_area *area; 1462 struct io_uring_cqe *cqe; 1463 u64 offset; 1464 1465 if (!io_defer_get_uncommited_cqe(ctx, &cqe)) 1466 return false; 1467 1468 cqe->user_data = req->cqe.user_data; 1469 cqe->res = len; 1470 cqe->flags = IORING_CQE_F_MORE; 1471 if (ctx->flags & IORING_SETUP_CQE_MIXED) 1472 cqe->flags |= IORING_CQE_F_32; 1473 1474 area = io_zcrx_iov_to_area(niov); 1475 offset = off + (net_iov_idx(niov) << ifq->niov_shift); 1476 rcqe = (struct io_uring_zcrx_cqe *)(cqe + 1); 1477 rcqe->off = offset + ((u64)area->area_id << IORING_ZCRX_AREA_SHIFT); 1478 rcqe->__pad = 0; 1479 return true; 1480 } 1481 1482 static struct net_iov *io_alloc_fallback_niov(struct io_zcrx_ifq *ifq) 1483 { 1484 struct io_zcrx_area *area = ifq->area; 1485 struct net_iov *niov = NULL; 1486 1487 if (!ifq->kern_readable) 1488 return NULL; 1489 1490 scoped_guard(spinlock_bh, &area->freelist_lock) 1491 niov = zcrx_get_free_niov(area); 1492 1493 if (niov) 1494 page_pool_fragment_netmem(net_iov_to_netmem(niov), 1); 1495 return niov; 1496 } 1497 1498 struct io_copy_cache { 1499 struct page *page; 1500 unsigned long offset; 1501 size_t size; 1502 }; 1503 1504 static ssize_t io_copy_page(struct io_copy_cache *cc, struct page *src_page, 1505 unsigned int src_offset, size_t len) 1506 { 1507 size_t copied = 0; 1508 1509 len = min(len, cc->size); 1510 1511 while (len) { 1512 void *src_addr, *dst_addr; 1513 struct page *dst_page = cc->page; 1514 unsigned dst_offset = cc->offset; 1515 size_t n = len; 1516 1517 if (folio_test_partial_kmap(page_folio(dst_page)) || 1518 folio_test_partial_kmap(page_folio(src_page))) { 1519 dst_page += dst_offset / PAGE_SIZE; 1520 dst_offset = offset_in_page(dst_offset); 1521 src_page += src_offset / PAGE_SIZE; 1522 src_offset = offset_in_page(src_offset); 1523 n = min(PAGE_SIZE - src_offset, PAGE_SIZE - dst_offset); 1524 n = min(n, len); 1525 } 1526 1527 dst_addr = kmap_local_page(dst_page) + dst_offset; 1528 src_addr = kmap_local_page(src_page) + src_offset; 1529 1530 memcpy(dst_addr, src_addr, n); 1531 1532 kunmap_local(src_addr); 1533 kunmap_local(dst_addr); 1534 1535 cc->size -= n; 1536 cc->offset += n; 1537 src_offset += n; 1538 len -= n; 1539 copied += n; 1540 } 1541 return copied; 1542 } 1543 1544 static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq, 1545 struct page *src_page, unsigned int src_offset, 1546 size_t len) 1547 { 1548 size_t copied = 0; 1549 int ret = 0; 1550 1551 while (len) { 1552 struct io_copy_cache cc; 1553 struct net_iov *niov; 1554 size_t n; 1555 1556 niov = io_alloc_fallback_niov(ifq); 1557 if (!niov) { 1558 ret = -ENOMEM; 1559 break; 1560 } 1561 1562 cc.page = io_zcrx_iov_page(niov); 1563 cc.offset = 0; 1564 cc.size = PAGE_SIZE; 1565 1566 n = io_copy_page(&cc, src_page, src_offset, len); 1567 1568 if (!io_zcrx_queue_cqe(req, niov, ifq, 0, n)) { 1569 io_zcrx_return_niov(niov); 1570 ret = -ENOSPC; 1571 break; 1572 } 1573 1574 io_zcrx_get_niov_uref(niov); 1575 src_offset += n; 1576 len -= n; 1577 copied += n; 1578 } 1579 1580 return copied ? copied : ret; 1581 } 1582 1583 static int io_zcrx_copy_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq, 1584 const skb_frag_t *frag, int off, int len) 1585 { 1586 struct page *page = skb_frag_page(frag); 1587 int ret; 1588 1589 ret = io_zcrx_copy_chunk(req, ifq, page, off + skb_frag_off(frag), len); 1590 if (ret > 0) { 1591 if (ifq->notif_stats) { 1592 zcrx_stat_add(&ifq->notif_stats->copy_count, 1); 1593 zcrx_stat_add(&ifq->notif_stats->copy_bytes, ret); 1594 } 1595 zcrx_send_notif(ifq, ZCRX_NOTIF_COPY); 1596 } 1597 1598 return ret; 1599 } 1600 1601 static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq, 1602 const skb_frag_t *frag, int off, int len) 1603 { 1604 struct net_iov *niov; 1605 struct page_pool *pp; 1606 1607 if (unlikely(!skb_frag_is_net_iov(frag))) 1608 return io_zcrx_copy_frag(req, ifq, frag, off, len); 1609 1610 niov = netmem_to_net_iov(frag->netmem); 1611 pp = niov->desc.pp; 1612 1613 if (!pp || pp->mp_ops != &io_uring_pp_zc_ops || io_pp_to_ifq(pp) != ifq) 1614 return -EFAULT; 1615 1616 if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len)) 1617 return -ENOSPC; 1618 1619 /* 1620 * Prevent it from being recycled while user is accessing it. 1621 * It has to be done before grabbing a user reference. 1622 */ 1623 page_pool_ref_netmem(net_iov_to_netmem(niov)); 1624 io_zcrx_get_niov_uref(niov); 1625 return len; 1626 } 1627 1628 static int 1629 io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb, 1630 unsigned int offset, size_t len) 1631 { 1632 struct io_zcrx_args *args = desc->arg.data; 1633 struct io_zcrx_ifq *ifq = args->ifq; 1634 struct io_kiocb *req = args->req; 1635 struct sk_buff *frag_iter; 1636 unsigned start, start_off = offset; 1637 int i, copy, end, off; 1638 int ret = 0; 1639 1640 len = min_t(size_t, len, desc->count); 1641 /* 1642 * __tcp_read_sock() always calls io_zcrx_recv_skb one last time, even 1643 * if desc->count is already 0. This is caused by the if (offset + 1 != 1644 * skb->len) check. Return early in this case to break out of 1645 * __tcp_read_sock(). 1646 */ 1647 if (!len) 1648 return 0; 1649 if (unlikely(args->nr_skbs++ > IO_SKBS_PER_CALL_LIMIT)) 1650 return -EAGAIN; 1651 1652 if (unlikely(offset < skb_headlen(skb))) { 1653 ssize_t copied; 1654 size_t to_copy; 1655 1656 to_copy = min_t(size_t, skb_headlen(skb) - offset, len); 1657 copied = io_zcrx_copy_chunk(req, ifq, virt_to_page(skb->data), 1658 offset_in_page(skb->data) + offset, 1659 to_copy); 1660 if (copied < 0) { 1661 ret = copied; 1662 goto out; 1663 } 1664 offset += copied; 1665 len -= copied; 1666 if (!len) 1667 goto out; 1668 if (offset != skb_headlen(skb)) 1669 goto out; 1670 } 1671 1672 start = skb_headlen(skb); 1673 1674 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 1675 const skb_frag_t *frag; 1676 1677 if (WARN_ON(start > offset + len)) 1678 return -EFAULT; 1679 1680 frag = &skb_shinfo(skb)->frags[i]; 1681 end = start + skb_frag_size(frag); 1682 1683 if (offset < end) { 1684 copy = end - offset; 1685 if (copy > len) 1686 copy = len; 1687 1688 off = offset - start; 1689 ret = io_zcrx_recv_frag(req, ifq, frag, off, copy); 1690 if (ret < 0) 1691 goto out; 1692 1693 offset += ret; 1694 len -= ret; 1695 if (len == 0 || ret != copy) 1696 goto out; 1697 } 1698 start = end; 1699 } 1700 1701 skb_walk_frags(skb, frag_iter) { 1702 if (WARN_ON(start > offset + len)) 1703 return -EFAULT; 1704 1705 end = start + frag_iter->len; 1706 if (offset < end) { 1707 size_t count; 1708 1709 copy = end - offset; 1710 if (copy > len) 1711 copy = len; 1712 1713 off = offset - start; 1714 count = desc->count; 1715 ret = io_zcrx_recv_skb(desc, frag_iter, off, copy); 1716 desc->count = count; 1717 if (ret < 0) 1718 goto out; 1719 1720 offset += ret; 1721 len -= ret; 1722 if (len == 0 || ret != copy) 1723 goto out; 1724 } 1725 start = end; 1726 } 1727 1728 out: 1729 if (offset == start_off) 1730 return ret; 1731 desc->count -= (offset - start_off); 1732 return offset - start_off; 1733 } 1734 1735 static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq, 1736 struct sock *sk, int flags, 1737 unsigned issue_flags, unsigned int *outlen) 1738 { 1739 unsigned int len = *outlen; 1740 struct io_zcrx_args args = { 1741 .req = req, 1742 .ifq = ifq, 1743 }; 1744 read_descriptor_t rd_desc = { 1745 .count = len ? len : UINT_MAX, 1746 .arg.data = &args, 1747 }; 1748 int ret; 1749 1750 lock_sock(sk); 1751 ret = tcp_read_sock(sk, &rd_desc, io_zcrx_recv_skb); 1752 if (len && ret > 0) 1753 *outlen = len - ret; 1754 if (ret <= 0) { 1755 if (ret < 0 || sock_flag(sk, SOCK_DONE)) 1756 goto out; 1757 if (sk->sk_err) 1758 ret = sock_error(sk); 1759 else if (sk->sk_shutdown & RCV_SHUTDOWN) 1760 goto out; 1761 else if (sk->sk_state == TCP_CLOSE) 1762 ret = -ENOTCONN; 1763 else 1764 ret = -EAGAIN; 1765 } else if (unlikely(args.nr_skbs > IO_SKBS_PER_CALL_LIMIT) && 1766 (issue_flags & IO_URING_F_MULTISHOT)) { 1767 ret = IOU_REQUEUE; 1768 } else if (sock_flag(sk, SOCK_DONE)) { 1769 /* Make it to retry until it finally gets 0. */ 1770 if (issue_flags & IO_URING_F_MULTISHOT) 1771 ret = IOU_REQUEUE; 1772 else 1773 ret = -EAGAIN; 1774 } 1775 out: 1776 release_sock(sk); 1777 return ret; 1778 } 1779 1780 int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, 1781 struct socket *sock, unsigned int flags, 1782 unsigned issue_flags, unsigned int *len) 1783 { 1784 struct sock *sk = sock->sk; 1785 const struct proto *prot = READ_ONCE(sk->sk_prot); 1786 1787 if (prot->recvmsg != tcp_recvmsg) 1788 return -EPROTONOSUPPORT; 1789 1790 sock_rps_record_flow(sk); 1791 return io_zcrx_tcp_recvmsg(req, ifq, sk, flags, issue_flags, len); 1792 } 1793