1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/kernel.h> 3 #include <linux/errno.h> 4 #include <linux/dma-map-ops.h> 5 #include <linux/mm.h> 6 #include <linux/nospec.h> 7 #include <linux/io_uring.h> 8 #include <linux/netdevice.h> 9 #include <linux/rtnetlink.h> 10 #include <linux/skbuff_ref.h> 11 #include <linux/anon_inodes.h> 12 13 #include <net/page_pool/helpers.h> 14 #include <net/page_pool/memory_provider.h> 15 #include <net/netlink.h> 16 #include <net/netdev_queues.h> 17 #include <net/netdev_rx_queue.h> 18 #include <net/tcp.h> 19 #include <net/rps.h> 20 21 #include <trace/events/page_pool.h> 22 23 #include <uapi/linux/io_uring.h> 24 25 #include "io_uring.h" 26 #include "kbuf.h" 27 #include "memmap.h" 28 #include "zcrx.h" 29 #include "rsrc.h" 30 31 #define IO_ZCRX_AREA_SUPPORTED_FLAGS (IORING_ZCRX_AREA_DMABUF) 32 33 #define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) 34 35 static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp) 36 { 37 return pp->mp_priv; 38 } 39 40 static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov) 41 { 42 struct net_iov_area *owner = net_iov_owner(niov); 43 44 return container_of(owner, struct io_zcrx_area, nia); 45 } 46 47 static bool zcrx_set_ring_ctx(struct io_zcrx_ifq *zcrx, 48 struct io_ring_ctx *ctx) 49 { 50 guard(spinlock_bh)(&zcrx->ctx_lock); 51 if (zcrx->master_ctx) 52 return false; 53 percpu_ref_get(&ctx->refs); 54 zcrx->master_ctx = ctx; 55 return true; 56 } 57 58 static inline struct page *io_zcrx_iov_page(const struct net_iov *niov) 59 { 60 struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); 61 unsigned niov_pages_shift; 62 63 lockdep_assert(!area->mem.is_dmabuf); 64 65 niov_pages_shift = area->ifq->niov_shift - PAGE_SHIFT; 66 return area->mem.pages[net_iov_idx(niov) << niov_pages_shift]; 67 } 68 69 static int io_area_max_shift(struct io_zcrx_mem *mem) 70 { 71 struct sg_table *sgt = mem->sgt; 72 struct scatterlist *sg; 73 unsigned shift = -1U; 74 unsigned i; 75 76 for_each_sgtable_dma_sg(sgt, sg, i) 77 shift = min(shift, __ffs(sg_dma_len(sg))); 78 return shift; 79 } 80 81 static int io_populate_area_dma(struct io_zcrx_ifq *ifq, 82 struct io_zcrx_area *area) 83 { 84 unsigned niov_size = 1U << ifq->niov_shift; 85 struct sg_table *sgt = area->mem.sgt; 86 struct scatterlist *sg; 87 unsigned i, niov_idx = 0; 88 89 for_each_sgtable_dma_sg(sgt, sg, i) { 90 dma_addr_t dma = sg_dma_address(sg); 91 unsigned long sg_len = sg_dma_len(sg); 92 93 if (WARN_ON_ONCE(sg_len % niov_size)) 94 return -EINVAL; 95 96 while (sg_len && niov_idx < area->nia.num_niovs) { 97 struct net_iov *niov = &area->nia.niovs[niov_idx]; 98 99 if (net_mp_niov_set_dma_addr(niov, dma)) 100 return -EFAULT; 101 sg_len -= niov_size; 102 dma += niov_size; 103 niov_idx++; 104 } 105 } 106 107 if (WARN_ON_ONCE(niov_idx != area->nia.num_niovs)) 108 return -EFAULT; 109 return 0; 110 } 111 112 static void io_release_dmabuf(struct io_zcrx_mem *mem) 113 { 114 if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) 115 return; 116 117 if (mem->sgt) 118 dma_buf_unmap_attachment_unlocked(mem->attach, mem->sgt, 119 DMA_FROM_DEVICE); 120 if (mem->attach) 121 dma_buf_detach(mem->dmabuf, mem->attach); 122 if (mem->dmabuf) 123 dma_buf_put(mem->dmabuf); 124 125 mem->sgt = NULL; 126 mem->attach = NULL; 127 mem->dmabuf = NULL; 128 } 129 130 static int io_import_dmabuf(struct io_zcrx_ifq *ifq, 131 struct io_zcrx_mem *mem, 132 struct io_uring_zcrx_area_reg *area_reg) 133 { 134 unsigned long off = (unsigned long)area_reg->addr; 135 unsigned long len = (unsigned long)area_reg->len; 136 unsigned long total_size = 0; 137 struct scatterlist *sg; 138 int dmabuf_fd = area_reg->dmabuf_fd; 139 int i, ret; 140 141 if (!ifq->dev) 142 return -EINVAL; 143 if (off) 144 return -EINVAL; 145 if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) 146 return -EINVAL; 147 148 mem->is_dmabuf = true; 149 mem->dmabuf = dma_buf_get(dmabuf_fd); 150 if (IS_ERR(mem->dmabuf)) { 151 ret = PTR_ERR(mem->dmabuf); 152 mem->dmabuf = NULL; 153 goto err; 154 } 155 156 mem->attach = dma_buf_attach(mem->dmabuf, ifq->dev); 157 if (IS_ERR(mem->attach)) { 158 ret = PTR_ERR(mem->attach); 159 mem->attach = NULL; 160 goto err; 161 } 162 163 mem->sgt = dma_buf_map_attachment_unlocked(mem->attach, DMA_FROM_DEVICE); 164 if (IS_ERR(mem->sgt)) { 165 ret = PTR_ERR(mem->sgt); 166 mem->sgt = NULL; 167 goto err; 168 } 169 170 for_each_sgtable_dma_sg(mem->sgt, sg, i) 171 total_size += sg_dma_len(sg); 172 173 if (total_size != len) { 174 ret = -EINVAL; 175 goto err; 176 } 177 178 mem->size = len; 179 return 0; 180 err: 181 io_release_dmabuf(mem); 182 return ret; 183 } 184 185 static unsigned long io_count_account_pages(struct page **pages, unsigned nr_pages) 186 { 187 struct folio *last_folio = NULL; 188 unsigned long res = 0; 189 int i; 190 191 for (i = 0; i < nr_pages; i++) { 192 struct folio *folio = page_folio(pages[i]); 193 194 if (folio == last_folio) 195 continue; 196 last_folio = folio; 197 res += folio_nr_pages(folio); 198 } 199 return res; 200 } 201 202 static int io_import_umem(struct io_zcrx_ifq *ifq, 203 struct io_zcrx_mem *mem, 204 struct io_uring_zcrx_area_reg *area_reg) 205 { 206 struct page **pages; 207 int nr_pages, ret; 208 bool mapped = false; 209 210 if (area_reg->dmabuf_fd) 211 return -EINVAL; 212 if (!area_reg->addr) 213 return -EFAULT; 214 pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len, 215 &nr_pages); 216 if (IS_ERR(pages)) 217 return PTR_ERR(pages); 218 219 ret = sg_alloc_table_from_pages(&mem->page_sg_table, pages, nr_pages, 220 0, (unsigned long)nr_pages << PAGE_SHIFT, 221 GFP_KERNEL_ACCOUNT); 222 if (ret) 223 goto out_err; 224 225 if (ifq->dev) { 226 ret = dma_map_sgtable(ifq->dev, &mem->page_sg_table, 227 DMA_FROM_DEVICE, IO_DMA_ATTR); 228 if (ret < 0) 229 goto out_err; 230 mapped = true; 231 } 232 233 mem->account_pages = io_count_account_pages(pages, nr_pages); 234 ret = io_account_mem(ifq->user, ifq->mm_account, mem->account_pages); 235 if (ret < 0) { 236 mem->account_pages = 0; 237 goto out_err; 238 } 239 240 mem->sgt = &mem->page_sg_table; 241 mem->pages = pages; 242 mem->nr_folios = nr_pages; 243 mem->size = area_reg->len; 244 return ret; 245 out_err: 246 if (mapped) 247 dma_unmap_sgtable(ifq->dev, &mem->page_sg_table, 248 DMA_FROM_DEVICE, IO_DMA_ATTR); 249 sg_free_table(&mem->page_sg_table); 250 unpin_user_pages(pages, nr_pages); 251 kvfree(pages); 252 return ret; 253 } 254 255 static void io_release_area_mem(struct io_zcrx_mem *mem) 256 { 257 if (mem->is_dmabuf) { 258 io_release_dmabuf(mem); 259 } else if (mem->pages) { 260 unpin_user_pages(mem->pages, mem->nr_folios); 261 sg_free_table(mem->sgt); 262 kvfree(mem->pages); 263 } 264 mem->pages = IO_URING_PTR_POISON; 265 mem->sgt = IO_URING_PTR_POISON; 266 } 267 268 static int io_import_area(struct io_zcrx_ifq *ifq, 269 struct io_zcrx_mem *mem, 270 struct io_uring_zcrx_area_reg *area_reg) 271 { 272 int ret; 273 274 if (area_reg->flags & ~IO_ZCRX_AREA_SUPPORTED_FLAGS) 275 return -EINVAL; 276 if (area_reg->rq_area_token) 277 return -EINVAL; 278 if (area_reg->__resv2[0] || area_reg->__resv2[1]) 279 return -EINVAL; 280 281 ret = io_validate_user_buf_range(area_reg->addr, area_reg->len); 282 if (ret) 283 return ret; 284 if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK) 285 return -EINVAL; 286 287 if (area_reg->flags & IORING_ZCRX_AREA_DMABUF) 288 return io_import_dmabuf(ifq, mem, area_reg); 289 return io_import_umem(ifq, mem, area_reg); 290 } 291 292 static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, 293 struct io_zcrx_area *area) 294 { 295 int i; 296 297 guard(mutex)(&ifq->pp_lock); 298 if (!area->is_mapped) 299 return; 300 area->is_mapped = false; 301 302 if (area->nia.niovs) { 303 for (i = 0; i < area->nia.num_niovs; i++) 304 net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0); 305 } 306 307 if (area->mem.is_dmabuf) { 308 io_release_dmabuf(&area->mem); 309 } else { 310 dma_unmap_sgtable(ifq->dev, &area->mem.page_sg_table, 311 DMA_FROM_DEVICE, IO_DMA_ATTR); 312 } 313 } 314 315 static void zcrx_sync_for_device(struct page_pool *pp, struct io_zcrx_ifq *zcrx, 316 netmem_ref *netmems, unsigned nr) 317 { 318 #if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC) 319 struct device *dev = pp->p.dev; 320 unsigned i, niov_size; 321 dma_addr_t dma_addr; 322 323 if (!dma_dev_need_sync(dev)) 324 return; 325 niov_size = 1U << zcrx->niov_shift; 326 327 for (i = 0; i < nr; i++) { 328 dma_addr = page_pool_get_dma_addr_netmem(netmems[i]); 329 __dma_sync_single_for_device(dev, dma_addr + pp->p.offset, 330 niov_size, pp->p.dma_dir); 331 } 332 #endif 333 } 334 335 #define IO_RQ_MAX_ENTRIES 32768 336 337 #define IO_SKBS_PER_CALL_LIMIT 20 338 339 struct io_zcrx_args { 340 struct io_kiocb *req; 341 struct io_zcrx_ifq *ifq; 342 unsigned nr_skbs; 343 }; 344 345 static const struct memory_provider_ops io_uring_pp_zc_ops; 346 347 static inline atomic_t *io_get_user_counter(struct net_iov *niov) 348 { 349 struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); 350 351 return &area->user_refs[net_iov_idx(niov)]; 352 } 353 354 static bool io_zcrx_put_niov_uref(struct net_iov *niov) 355 { 356 atomic_t *uref = io_get_user_counter(niov); 357 int old; 358 359 old = atomic_read(uref); 360 do { 361 if (unlikely(old == 0)) 362 return false; 363 } while (!atomic_try_cmpxchg(uref, &old, old - 1)); 364 365 return true; 366 } 367 368 static void io_zcrx_get_niov_uref(struct net_iov *niov) 369 { 370 atomic_inc(io_get_user_counter(niov)); 371 } 372 373 static void io_fill_zcrx_offsets(struct io_uring_zcrx_offsets *offsets) 374 { 375 offsets->head = offsetof(struct io_uring, head); 376 offsets->tail = offsetof(struct io_uring, tail); 377 offsets->rqes = ALIGN(sizeof(struct io_uring), L1_CACHE_BYTES); 378 } 379 380 static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx, 381 struct io_zcrx_ifq *ifq, 382 struct io_uring_zcrx_ifq_reg *reg, 383 struct io_uring_region_desc *rd, 384 u32 id) 385 { 386 u64 mmap_offset; 387 size_t off, size; 388 void *ptr; 389 int ret; 390 391 io_fill_zcrx_offsets(®->offsets); 392 off = reg->offsets.rqes; 393 size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries; 394 if (size > rd->size) 395 return -EINVAL; 396 397 mmap_offset = IORING_MAP_OFF_ZCRX_REGION; 398 mmap_offset += (u64)id << IORING_OFF_ZCRX_SHIFT; 399 400 ret = io_create_region(ctx, &ifq->rq_region, rd, mmap_offset); 401 if (ret < 0) 402 return ret; 403 404 ptr = io_region_get_ptr(&ifq->rq_region); 405 ifq->rq.ring = (struct io_uring *)ptr; 406 ifq->rq.rqes = (struct io_uring_zcrx_rqe *)(ptr + off); 407 408 memset(ifq->rq.ring, 0, sizeof(*ifq->rq.ring)); 409 return 0; 410 } 411 412 static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) 413 { 414 io_free_region(ifq->user, &ifq->rq_region); 415 ifq->rq.ring = IO_URING_PTR_POISON; 416 ifq->rq.rqes = IO_URING_PTR_POISON; 417 ifq->notif_stats = IO_URING_PTR_POISON; 418 } 419 420 static void io_zcrx_free_area(struct io_zcrx_ifq *ifq, 421 struct io_zcrx_area *area) 422 { 423 io_zcrx_unmap_area(ifq, area); 424 io_release_area_mem(&area->mem); 425 426 if (area->mem.account_pages) 427 io_unaccount_mem(ifq->user, ifq->mm_account, 428 area->mem.account_pages); 429 430 kvfree(area->freelist); 431 kvfree(area->nia.niovs); 432 kvfree(area->user_refs); 433 kfree(area); 434 } 435 436 static int io_zcrx_append_area(struct io_zcrx_ifq *ifq, 437 struct io_zcrx_area *area) 438 { 439 bool kern_readable = !area->mem.is_dmabuf; 440 441 if (WARN_ON_ONCE(ifq->area)) 442 return -EINVAL; 443 if (WARN_ON_ONCE(ifq->kern_readable != kern_readable)) 444 return -EINVAL; 445 446 ifq->area = area; 447 return 0; 448 } 449 450 static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, 451 struct io_uring_zcrx_area_reg *area_reg, 452 struct io_uring_zcrx_ifq_reg *reg) 453 { 454 int buf_size_shift = PAGE_SHIFT; 455 struct io_zcrx_area *area; 456 unsigned nr_iovs; 457 int i, ret; 458 459 if (reg->rx_buf_len) { 460 if (!is_power_of_2(reg->rx_buf_len) || 461 reg->rx_buf_len < PAGE_SIZE) 462 return -EINVAL; 463 buf_size_shift = ilog2(reg->rx_buf_len); 464 } 465 if (!ifq->dev && buf_size_shift != PAGE_SHIFT) 466 return -EOPNOTSUPP; 467 468 ret = -ENOMEM; 469 area = kzalloc_obj(*area); 470 if (!area) 471 goto err; 472 area->ifq = ifq; 473 474 ret = io_import_area(ifq, &area->mem, area_reg); 475 if (ret) 476 goto err; 477 if (ifq->dev) 478 area->is_mapped = true; 479 480 if (ifq->dev && buf_size_shift > io_area_max_shift(&area->mem)) { 481 ret = -ERANGE; 482 goto err; 483 } 484 485 ifq->niov_shift = buf_size_shift; 486 nr_iovs = area->mem.size >> ifq->niov_shift; 487 area->nia.num_niovs = nr_iovs; 488 489 ret = -ENOMEM; 490 area->nia.niovs = kvmalloc_objs(area->nia.niovs[0], nr_iovs, 491 GFP_KERNEL_ACCOUNT | __GFP_ZERO); 492 if (!area->nia.niovs) 493 goto err; 494 495 area->freelist = kvmalloc_array(nr_iovs, sizeof(area->freelist[0]), 496 GFP_KERNEL_ACCOUNT | __GFP_ZERO); 497 if (!area->freelist) 498 goto err; 499 500 area->user_refs = kvmalloc_objs(area->user_refs[0], nr_iovs, 501 GFP_KERNEL_ACCOUNT | __GFP_ZERO); 502 if (!area->user_refs) 503 goto err; 504 505 for (i = 0; i < nr_iovs; i++) { 506 struct net_iov *niov = &area->nia.niovs[i]; 507 508 net_iov_init(niov, &area->nia, NET_IOV_IOURING); 509 area->freelist[i] = i; 510 atomic_set(&area->user_refs[i], 0); 511 } 512 513 if (ifq->dev) { 514 ret = io_populate_area_dma(ifq, area); 515 if (ret) 516 goto err; 517 } 518 519 area->free_count = nr_iovs; 520 /* we're only supporting one area per ifq for now */ 521 area->area_id = 0; 522 area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT; 523 spin_lock_init(&area->freelist_lock); 524 525 ret = io_zcrx_append_area(ifq, area); 526 if (!ret) 527 return 0; 528 err: 529 if (area) 530 io_zcrx_free_area(ifq, area); 531 return ret; 532 } 533 534 static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx) 535 { 536 struct io_zcrx_ifq *ifq; 537 538 ifq = kzalloc_obj(*ifq); 539 if (!ifq) 540 return NULL; 541 542 ifq->if_rxq = -1; 543 spin_lock_init(&ifq->ctx_lock); 544 spin_lock_init(&ifq->rq.lock); 545 mutex_init(&ifq->pp_lock); 546 refcount_set(&ifq->refs, 1); 547 refcount_set(&ifq->user_refs, 1); 548 return ifq; 549 } 550 551 static void io_zcrx_drop_netdev(struct io_zcrx_ifq *ifq) 552 { 553 guard(mutex)(&ifq->pp_lock); 554 555 if (!ifq->netdev) 556 return; 557 netdev_put(ifq->netdev, &ifq->netdev_tracker); 558 ifq->netdev = NULL; 559 } 560 561 static void io_close_queue(struct io_zcrx_ifq *ifq) 562 { 563 struct net_device *netdev; 564 netdevice_tracker netdev_tracker; 565 struct pp_memory_provider_params p = { 566 .mp_ops = &io_uring_pp_zc_ops, 567 .mp_priv = ifq, 568 }; 569 570 scoped_guard(mutex, &ifq->pp_lock) { 571 netdev = ifq->netdev; 572 netdev_tracker = ifq->netdev_tracker; 573 ifq->netdev = NULL; 574 } 575 576 if (netdev) { 577 if (ifq->if_rxq != -1) { 578 netdev_lock(netdev); 579 netif_mp_close_rxq(netdev, ifq->if_rxq, &p); 580 netdev_unlock(netdev); 581 } 582 netdev_put(netdev, &netdev_tracker); 583 } 584 ifq->if_rxq = -1; 585 } 586 587 static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) 588 { 589 if (WARN_ON_ONCE(ifq->if_rxq != -1)) 590 return; 591 if (WARN_ON_ONCE(ifq->netdev != NULL)) 592 return; 593 if (WARN_ON_ONCE(ifq->master_ctx)) 594 return; 595 596 if (ifq->area) 597 io_zcrx_free_area(ifq, ifq->area); 598 if (ifq->mm_account) 599 mmdrop(ifq->mm_account); 600 if (ifq->dev) 601 put_device(ifq->dev); 602 603 io_free_rbuf_ring(ifq); 604 free_uid(ifq->user); 605 mutex_destroy(&ifq->pp_lock); 606 kfree(ifq); 607 } 608 609 static void io_put_zcrx_ifq(struct io_zcrx_ifq *ifq) 610 { 611 if (refcount_dec_and_test(&ifq->refs)) 612 io_zcrx_ifq_free(ifq); 613 } 614 615 static void io_zcrx_return_niov_freelist(struct net_iov *niov) 616 { 617 struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); 618 619 guard(spinlock_bh)(&area->freelist_lock); 620 if (WARN_ON_ONCE(area->free_count >= area->nia.num_niovs)) 621 return; 622 area->freelist[area->free_count++] = net_iov_idx(niov); 623 } 624 625 static struct net_iov *zcrx_get_free_niov(struct io_zcrx_area *area) 626 { 627 unsigned niov_idx; 628 629 lockdep_assert_held(&area->freelist_lock); 630 631 if (unlikely(!area->free_count)) 632 return NULL; 633 634 niov_idx = area->freelist[--area->free_count]; 635 return &area->nia.niovs[niov_idx]; 636 } 637 638 static void io_zcrx_return_niov(struct net_iov *niov) 639 { 640 netmem_ref netmem = net_iov_to_netmem(niov); 641 642 if (!niov->desc.pp) { 643 /* copy fallback allocated niovs */ 644 io_zcrx_return_niov_freelist(niov); 645 return; 646 } 647 page_pool_put_unrefed_netmem(niov->desc.pp, netmem, -1, false); 648 } 649 650 static void io_zcrx_scrub(struct io_zcrx_ifq *ifq) 651 { 652 struct io_zcrx_area *area = ifq->area; 653 int i; 654 655 if (!area) 656 return; 657 658 /* Reclaim back all buffers given to the user space. */ 659 for (i = 0; i < area->nia.num_niovs; i++) { 660 struct net_iov *niov = &area->nia.niovs[i]; 661 int nr; 662 663 if (!atomic_read(io_get_user_counter(niov))) 664 continue; 665 nr = atomic_xchg(io_get_user_counter(niov), 0); 666 if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr)) 667 io_zcrx_return_niov(niov); 668 } 669 } 670 671 static void zcrx_unregister_user(struct io_zcrx_ifq *ifq, struct io_ring_ctx *ctx) 672 { 673 scoped_guard(spinlock_bh, &ifq->ctx_lock) { 674 if (ctx && ifq->master_ctx == ctx) { 675 ifq->master_ctx = NULL; 676 percpu_ref_put(&ctx->refs); 677 } 678 } 679 680 if (refcount_dec_and_test(&ifq->user_refs)) { 681 io_close_queue(ifq); 682 io_zcrx_scrub(ifq); 683 } 684 } 685 686 static void zcrx_unregister(struct io_zcrx_ifq *ifq, struct io_ring_ctx *ctx) 687 { 688 zcrx_unregister_user(ifq, ctx); 689 io_put_zcrx_ifq(ifq); 690 } 691 692 struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, 693 unsigned int id) 694 { 695 struct io_zcrx_ifq *ifq = xa_load(&ctx->zcrx_ctxs, id); 696 697 lockdep_assert_held(&ctx->mmap_lock); 698 699 return ifq ? &ifq->rq_region : NULL; 700 } 701 702 static int zcrx_box_release(struct inode *inode, struct file *file) 703 { 704 struct io_zcrx_ifq *ifq = file->private_data; 705 706 if (WARN_ON_ONCE(!ifq)) 707 return -EFAULT; 708 zcrx_unregister(ifq, NULL); 709 return 0; 710 } 711 712 static const struct file_operations zcrx_box_fops = { 713 .owner = THIS_MODULE, 714 .release = zcrx_box_release, 715 }; 716 717 static int zcrx_export(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq, 718 struct zcrx_ctrl *ctrl, void __user *arg) 719 { 720 struct zcrx_ctrl_export *ce = &ctrl->zc_export; 721 struct file *file; 722 int fd; 723 724 if (!mem_is_zero(ce, sizeof(*ce))) 725 return -EINVAL; 726 727 refcount_inc(&ifq->refs); 728 refcount_inc(&ifq->user_refs); 729 730 file = anon_inode_create_getfile("[zcrx]", &zcrx_box_fops, 731 ifq, O_CLOEXEC, NULL); 732 if (IS_ERR(file)) { 733 zcrx_unregister(ifq, NULL); 734 return PTR_ERR(file); 735 } 736 737 fd = get_unused_fd_flags(O_CLOEXEC); 738 if (fd < 0) { 739 fput(file); 740 return fd; 741 } 742 743 ce->zcrx_fd = fd; 744 if (copy_to_user(arg, ctrl, sizeof(*ctrl))) { 745 fput(file); 746 put_unused_fd(fd); 747 return -EFAULT; 748 } 749 750 fd_install(fd, file); 751 return 0; 752 } 753 754 static int import_zcrx(struct io_ring_ctx *ctx, 755 struct io_uring_zcrx_ifq_reg __user *arg, 756 struct io_uring_zcrx_ifq_reg *reg) 757 { 758 struct io_zcrx_ifq *ifq; 759 struct file *file; 760 int fd, ret; 761 u32 id; 762 763 if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) 764 return -EINVAL; 765 if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))) 766 return -EINVAL; 767 if (reg->if_rxq || reg->rq_entries || reg->area_ptr || reg->region_ptr) 768 return -EINVAL; 769 if (reg->notif_desc) 770 return -EINVAL; 771 if (reg->flags & ~ZCRX_REG_IMPORT) 772 return -EINVAL; 773 774 fd = reg->if_idx; 775 CLASS(fd, f)(fd); 776 if (fd_empty(f)) 777 return -EBADF; 778 779 file = fd_file(f); 780 if (file->f_op != &zcrx_box_fops || !file->private_data) 781 return -EBADF; 782 783 ifq = file->private_data; 784 refcount_inc(&ifq->refs); 785 refcount_inc(&ifq->user_refs); 786 787 scoped_guard(mutex, &ctx->mmap_lock) { 788 ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL); 789 if (ret) 790 goto err; 791 } 792 793 reg->zcrx_id = id; 794 io_fill_zcrx_offsets(®->offsets); 795 if (copy_to_user(arg, reg, sizeof(*reg))) { 796 ret = -EFAULT; 797 goto err_xa_erase; 798 } 799 800 scoped_guard(mutex, &ctx->mmap_lock) { 801 ret = -ENOMEM; 802 if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL)) 803 goto err_xa_erase; 804 } 805 806 return 0; 807 err_xa_erase: 808 scoped_guard(mutex, &ctx->mmap_lock) 809 xa_erase(&ctx->zcrx_ctxs, id); 810 err: 811 zcrx_unregister(ifq, ctx); 812 return ret; 813 } 814 815 static int zcrx_register_netdev(struct io_zcrx_ifq *ifq, 816 struct io_uring_zcrx_ifq_reg *reg, 817 struct io_uring_zcrx_area_reg *area) 818 { 819 struct pp_memory_provider_params mp_param = {}; 820 unsigned if_rxq = reg->if_rxq; 821 int ret; 822 823 ifq->netdev = netdev_get_by_index_lock(current->nsproxy->net_ns, 824 reg->if_idx); 825 if (!ifq->netdev) 826 return -ENODEV; 827 828 netdev_hold(ifq->netdev, &ifq->netdev_tracker, GFP_KERNEL); 829 830 ifq->dev = netdev_queue_get_dma_dev(ifq->netdev, if_rxq, NETDEV_QUEUE_TYPE_RX); 831 if (!ifq->dev) { 832 ret = -EOPNOTSUPP; 833 goto netdev_put_unlock; 834 } 835 get_device(ifq->dev); 836 837 ret = io_zcrx_create_area(ifq, area, reg); 838 if (ret) 839 goto netdev_put_unlock; 840 841 if (reg->rx_buf_len) 842 mp_param.rx_page_size = 1U << ifq->niov_shift; 843 mp_param.mp_ops = &io_uring_pp_zc_ops; 844 mp_param.mp_priv = ifq; 845 ret = netif_mp_open_rxq(ifq->netdev, if_rxq, &mp_param, NULL); 846 if (ret) 847 goto netdev_put_unlock; 848 849 ifq->if_rxq = if_rxq; 850 ret = 0; 851 netdev_put_unlock: 852 netdev_unlock(ifq->netdev); 853 return ret; 854 } 855 856 static int zcrx_validate_notif_stats(struct io_zcrx_ifq *ifq, 857 const struct io_uring_zcrx_ifq_reg *reg, 858 const struct zcrx_notification_desc *notif) 859 { 860 size_t stats_off = notif->stats_offset; 861 size_t used, end; 862 863 used = reg->offsets.rqes + 864 sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries; 865 866 if (!IS_ALIGNED(stats_off, __alignof__(struct zcrx_notif_stats))) 867 return -EINVAL; 868 if (stats_off < used) 869 return -ERANGE; 870 if (check_add_overflow(stats_off, 871 sizeof(struct zcrx_notif_stats), 872 &end)) 873 return -ERANGE; 874 if (end > io_region_size(&ifq->rq_region)) 875 return -ERANGE; 876 877 ifq->notif_stats = io_region_get_ptr(&ifq->rq_region) + stats_off; 878 memset(ifq->notif_stats, 0, sizeof(*ifq->notif_stats)); 879 880 return 0; 881 } 882 883 int io_register_zcrx(struct io_ring_ctx *ctx, 884 struct io_uring_zcrx_ifq_reg __user *arg) 885 { 886 struct zcrx_notification_desc notif; 887 struct io_uring_zcrx_area_reg area; 888 struct io_uring_zcrx_ifq_reg reg; 889 struct io_uring_region_desc rd; 890 struct io_zcrx_ifq *ifq; 891 int ret; 892 u32 id; 893 894 /* 895 * 1. Interface queue allocation. 896 * 2. It can observe data destined for sockets of other tasks. 897 */ 898 if (!capable(CAP_NET_ADMIN)) 899 return -EPERM; 900 901 /* mandatory io_uring features for zc rx */ 902 if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) 903 return -EINVAL; 904 if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))) 905 return -EINVAL; 906 if (copy_from_user(®, arg, sizeof(reg))) 907 return -EFAULT; 908 if (!mem_is_zero(®.__resv, sizeof(reg.__resv)) || reg.zcrx_id) 909 return -EINVAL; 910 if (reg.flags & ~ZCRX_SUPPORTED_REG_FLAGS) 911 return -EINVAL; 912 if (reg.flags & ZCRX_REG_IMPORT) 913 return import_zcrx(ctx, arg, ®); 914 if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd))) 915 return -EFAULT; 916 if (reg.if_rxq == -1 || !reg.rq_entries) 917 return -EINVAL; 918 if ((reg.if_rxq || reg.if_idx) && (reg.flags & ZCRX_REG_NODEV)) 919 return -EINVAL; 920 if (reg.rq_entries > IO_RQ_MAX_ENTRIES) { 921 if (!(ctx->flags & IORING_SETUP_CLAMP)) 922 return -EINVAL; 923 reg.rq_entries = IO_RQ_MAX_ENTRIES; 924 } 925 reg.rq_entries = roundup_pow_of_two(reg.rq_entries); 926 927 if (copy_from_user(&area, u64_to_user_ptr(reg.area_ptr), sizeof(area))) 928 return -EFAULT; 929 930 memset(¬if, 0, sizeof(notif)); 931 if (reg.notif_desc && copy_from_user(¬if, u64_to_user_ptr(reg.notif_desc), 932 sizeof(notif))) 933 return -EFAULT; 934 if (notif.type_mask & ~ZCRX_NOTIF_TYPE_MASK) 935 return -EINVAL; 936 if (notif.flags & ~ZCRX_NOTIF_DESC_FLAG_STATS) 937 return -EINVAL; 938 if (!(notif.flags & ZCRX_NOTIF_DESC_FLAG_STATS)) { 939 if (notif.stats_offset) 940 return -EINVAL; 941 } 942 if (!mem_is_zero(¬if.__resv2, sizeof(notif.__resv2))) 943 return -EINVAL; 944 945 ifq = io_zcrx_ifq_alloc(ctx); 946 if (!ifq) 947 return -ENOMEM; 948 949 ifq->notif_data = notif.user_data; 950 ifq->allowed_notif_mask = notif.type_mask; 951 952 if (ctx->user) { 953 get_uid(ctx->user); 954 ifq->user = ctx->user; 955 } 956 if (ctx->mm_account) { 957 mmgrab(ctx->mm_account); 958 ifq->mm_account = ctx->mm_account; 959 } 960 ifq->rq.nr_entries = reg.rq_entries; 961 962 scoped_guard(mutex, &ctx->mmap_lock) { 963 /* preallocate id */ 964 ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL); 965 if (ret) 966 goto ifq_free; 967 } 968 969 ret = io_allocate_rbuf_ring(ctx, ifq, ®, &rd, id); 970 if (ret) 971 goto err; 972 973 if (notif.flags & ZCRX_NOTIF_DESC_FLAG_STATS) { 974 ret = zcrx_validate_notif_stats(ifq, ®, ¬if); 975 if (ret) 976 goto err; 977 } 978 979 ifq->kern_readable = !(area.flags & IORING_ZCRX_AREA_DMABUF); 980 981 if (!(reg.flags & ZCRX_REG_NODEV)) { 982 ret = zcrx_register_netdev(ifq, ®, &area); 983 if (ret) 984 goto err; 985 } else { 986 ret = io_zcrx_create_area(ifq, &area, ®); 987 if (ret) 988 goto err; 989 } 990 991 reg.zcrx_id = id; 992 993 scoped_guard(mutex, &ctx->mmap_lock) { 994 /* publish ifq */ 995 ret = -ENOMEM; 996 if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL)) 997 goto err; 998 } 999 1000 reg.rx_buf_len = 1U << ifq->niov_shift; 1001 1002 if (copy_to_user(arg, ®, sizeof(reg)) || 1003 copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) || 1004 copy_to_user(u64_to_user_ptr(reg.area_ptr), &area, sizeof(area))) { 1005 ret = -EFAULT; 1006 goto err; 1007 } 1008 1009 if (notif.type_mask) 1010 zcrx_set_ring_ctx(ifq, ctx); 1011 return 0; 1012 err: 1013 scoped_guard(mutex, &ctx->mmap_lock) 1014 xa_erase(&ctx->zcrx_ctxs, id); 1015 ifq_free: 1016 zcrx_unregister(ifq, ctx); 1017 return ret; 1018 } 1019 1020 static inline bool is_zcrx_entry_marked(struct io_ring_ctx *ctx, unsigned long id) 1021 { 1022 return xa_get_mark(&ctx->zcrx_ctxs, id, XA_MARK_1); 1023 } 1024 1025 static inline void set_zcrx_entry_mark(struct io_ring_ctx *ctx, unsigned long id) 1026 { 1027 xa_set_mark(&ctx->zcrx_ctxs, id, XA_MARK_1); 1028 } 1029 1030 void io_terminate_zcrx(struct io_ring_ctx *ctx) 1031 { 1032 struct io_zcrx_ifq *ifq; 1033 unsigned long id = 0; 1034 1035 lockdep_assert_held(&ctx->uring_lock); 1036 1037 while (1) { 1038 scoped_guard(mutex, &ctx->mmap_lock) 1039 ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT); 1040 if (!ifq) 1041 break; 1042 if (WARN_ON_ONCE(is_zcrx_entry_marked(ctx, id))) 1043 break; 1044 set_zcrx_entry_mark(ctx, id); 1045 id++; 1046 zcrx_unregister_user(ifq, ctx); 1047 } 1048 } 1049 1050 void io_unregister_zcrx(struct io_ring_ctx *ctx) 1051 { 1052 struct io_zcrx_ifq *ifq; 1053 1054 lockdep_assert_held(&ctx->uring_lock); 1055 1056 while (1) { 1057 scoped_guard(mutex, &ctx->mmap_lock) { 1058 unsigned long id = 0; 1059 1060 ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT); 1061 if (ifq) { 1062 if (WARN_ON_ONCE(!is_zcrx_entry_marked(ctx, id))) { 1063 ifq = NULL; 1064 break; 1065 } 1066 xa_erase(&ctx->zcrx_ctxs, id); 1067 } 1068 } 1069 if (!ifq) 1070 break; 1071 /* 1072 * io_uring can run requests and return buffers to the user 1073 * after termination, scrub it again. 1074 */ 1075 if (refcount_read(&ifq->user_refs) == 0) 1076 io_zcrx_scrub(ifq); 1077 io_put_zcrx_ifq(ifq); 1078 } 1079 1080 xa_destroy(&ctx->zcrx_ctxs); 1081 } 1082 1083 static inline u32 zcrx_rq_entries(struct zcrx_rq *rq) 1084 { 1085 u32 entries; 1086 1087 entries = smp_load_acquire(&rq->ring->tail) - rq->cached_head; 1088 return min(entries, rq->nr_entries); 1089 } 1090 1091 static struct io_uring_zcrx_rqe *zcrx_next_rqe(struct zcrx_rq *rq, unsigned mask) 1092 { 1093 unsigned int idx = rq->cached_head++ & mask; 1094 1095 return &rq->rqes[idx]; 1096 } 1097 1098 static inline bool io_parse_rqe(struct io_uring_zcrx_rqe *rqe, 1099 struct io_zcrx_ifq *ifq, 1100 struct net_iov **ret_niov) 1101 { 1102 __u64 off = READ_ONCE(rqe->off); 1103 unsigned niov_idx, area_idx; 1104 struct io_zcrx_area *area; 1105 1106 area_idx = off >> IORING_ZCRX_AREA_SHIFT; 1107 niov_idx = (off & ~IORING_ZCRX_AREA_MASK) >> ifq->niov_shift; 1108 1109 if (unlikely(rqe->__pad || area_idx)) 1110 return false; 1111 area = ifq->area; 1112 1113 if (unlikely(niov_idx >= area->nia.num_niovs)) 1114 return false; 1115 niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs); 1116 1117 *ret_niov = &area->nia.niovs[niov_idx]; 1118 return true; 1119 } 1120 1121 static unsigned io_zcrx_ring_refill(struct page_pool *pp, 1122 struct io_zcrx_ifq *ifq, 1123 netmem_ref *netmems, unsigned to_alloc) 1124 { 1125 struct zcrx_rq *rq = &ifq->rq; 1126 unsigned int mask = rq->nr_entries - 1; 1127 unsigned int entries; 1128 unsigned allocated = 0; 1129 1130 guard(spinlock_bh)(&rq->lock); 1131 1132 entries = zcrx_rq_entries(rq); 1133 entries = min_t(unsigned, entries, to_alloc); 1134 if (unlikely(!entries)) 1135 return 0; 1136 1137 do { 1138 struct io_uring_zcrx_rqe *rqe = zcrx_next_rqe(rq, mask); 1139 struct net_iov *niov; 1140 netmem_ref netmem; 1141 1142 if (!io_parse_rqe(rqe, ifq, &niov)) 1143 continue; 1144 if (!io_zcrx_put_niov_uref(niov)) 1145 continue; 1146 1147 netmem = net_iov_to_netmem(niov); 1148 if (!page_pool_unref_and_test(netmem)) 1149 continue; 1150 1151 if (unlikely(niov->desc.pp != pp)) { 1152 io_zcrx_return_niov(niov); 1153 continue; 1154 } 1155 1156 netmems[allocated] = netmem; 1157 allocated++; 1158 } while (--entries); 1159 1160 smp_store_release(&rq->ring->head, rq->cached_head); 1161 return allocated; 1162 } 1163 1164 static unsigned io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq, 1165 netmem_ref *netmems, unsigned to_alloc) 1166 { 1167 struct io_zcrx_area *area = ifq->area; 1168 unsigned allocated = 0; 1169 1170 guard(spinlock_bh)(&area->freelist_lock); 1171 1172 for (allocated = 0; allocated < to_alloc; allocated++) { 1173 struct net_iov *niov = zcrx_get_free_niov(area); 1174 1175 if (!niov) 1176 break; 1177 net_mp_niov_set_page_pool(pp, niov); 1178 netmems[allocated] = net_iov_to_netmem(niov); 1179 } 1180 return allocated; 1181 } 1182 1183 static void zcrx_notif_tw(struct io_tw_req tw_req, io_tw_token_t tw) 1184 { 1185 struct io_kiocb *req = tw_req.req; 1186 struct io_ring_ctx *ctx = req->ctx; 1187 1188 io_post_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, 0); 1189 percpu_ref_put(&ctx->refs); 1190 io_poison_req(req); 1191 kmem_cache_free(req_cachep, req); 1192 } 1193 1194 static void zcrx_stat_add(__u64 *p, s64 v) 1195 { 1196 WRITE_ONCE(*p, READ_ONCE(*p) + v); 1197 } 1198 1199 static void zcrx_send_notif(struct io_zcrx_ifq *ifq, unsigned type) 1200 { 1201 gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN | __GFP_ZERO; 1202 u32 type_mask = 1 << type; 1203 struct io_kiocb *req; 1204 1205 if (!(type_mask & ifq->allowed_notif_mask)) 1206 return; 1207 1208 guard(spinlock_bh)(&ifq->ctx_lock); 1209 if (!ifq->master_ctx) 1210 return; 1211 if (type_mask & ifq->fired_notifs) 1212 return; 1213 1214 req = kmem_cache_alloc(req_cachep, gfp); 1215 if (unlikely(!req)) 1216 return; 1217 1218 ifq->fired_notifs |= type_mask; 1219 1220 req->opcode = IORING_OP_NOP; 1221 req->cqe.user_data = ifq->notif_data; 1222 req->cqe.res = type; 1223 req->ctx = ifq->master_ctx; 1224 percpu_ref_get(&req->ctx->refs); 1225 req->tctx = NULL; 1226 req->io_task_work.func = zcrx_notif_tw; 1227 io_req_task_work_add(req); 1228 } 1229 1230 static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp) 1231 { 1232 struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp); 1233 netmem_ref *netmems = pp->alloc.cache; 1234 unsigned to_alloc = PP_ALLOC_CACHE_REFILL; 1235 unsigned allocated; 1236 1237 /* pp should already be ensuring that */ 1238 if (WARN_ON_ONCE(pp->alloc.count)) 1239 return 0; 1240 1241 allocated = io_zcrx_ring_refill(pp, ifq, netmems, to_alloc); 1242 if (likely(allocated)) 1243 goto out_return; 1244 1245 allocated = io_zcrx_refill_slow(pp, ifq, netmems, to_alloc); 1246 if (!allocated) { 1247 zcrx_send_notif(ifq, ZCRX_NOTIF_NO_BUFFERS); 1248 return 0; 1249 } 1250 out_return: 1251 zcrx_sync_for_device(pp, ifq, netmems, allocated); 1252 allocated--; 1253 pp->alloc.count += allocated; 1254 return netmems[allocated]; 1255 } 1256 1257 static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem) 1258 { 1259 struct net_iov *niov; 1260 1261 if (WARN_ON_ONCE(!netmem_is_net_iov(netmem))) 1262 return false; 1263 1264 niov = netmem_to_net_iov(netmem); 1265 net_mp_niov_clear_page_pool(niov); 1266 io_zcrx_return_niov_freelist(niov); 1267 return false; 1268 } 1269 1270 static int io_pp_zc_init(struct page_pool *pp) 1271 { 1272 struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp); 1273 1274 if (WARN_ON_ONCE(!ifq)) 1275 return -EINVAL; 1276 if (WARN_ON_ONCE(ifq->dev != pp->p.dev)) 1277 return -EINVAL; 1278 if (WARN_ON_ONCE(!pp->dma_map)) 1279 return -EOPNOTSUPP; 1280 if (pp->p.order + PAGE_SHIFT != ifq->niov_shift) 1281 return -EINVAL; 1282 if (pp->p.dma_dir != DMA_FROM_DEVICE) 1283 return -EOPNOTSUPP; 1284 1285 refcount_inc(&ifq->refs); 1286 return 0; 1287 } 1288 1289 static void io_pp_zc_destroy(struct page_pool *pp) 1290 { 1291 io_put_zcrx_ifq(io_pp_to_ifq(pp)); 1292 } 1293 1294 static int io_pp_nl_fill(void *mp_priv, struct sk_buff *rsp, 1295 struct netdev_rx_queue *rxq) 1296 { 1297 struct nlattr *nest; 1298 int type; 1299 1300 type = rxq ? NETDEV_A_QUEUE_IO_URING : NETDEV_A_PAGE_POOL_IO_URING; 1301 nest = nla_nest_start(rsp, type); 1302 if (!nest) 1303 return -EMSGSIZE; 1304 nla_nest_end(rsp, nest); 1305 1306 return 0; 1307 } 1308 1309 static void io_pp_uninstall(void *mp_priv, struct netdev_rx_queue *rxq) 1310 { 1311 struct pp_memory_provider_params *p = &rxq->mp_params; 1312 struct io_zcrx_ifq *ifq = mp_priv; 1313 1314 io_zcrx_drop_netdev(ifq); 1315 if (ifq->area) 1316 io_zcrx_unmap_area(ifq, ifq->area); 1317 1318 p->mp_ops = NULL; 1319 p->mp_priv = NULL; 1320 } 1321 1322 static const struct memory_provider_ops io_uring_pp_zc_ops = { 1323 .alloc_netmems = io_pp_zc_alloc_netmems, 1324 .release_netmem = io_pp_zc_release_netmem, 1325 .init = io_pp_zc_init, 1326 .destroy = io_pp_zc_destroy, 1327 .nl_fill = io_pp_nl_fill, 1328 .uninstall = io_pp_uninstall, 1329 }; 1330 1331 static unsigned zcrx_parse_rq(netmem_ref *netmem_array, unsigned nr, 1332 struct io_zcrx_ifq *zcrx, struct zcrx_rq *rq) 1333 { 1334 unsigned int mask = rq->nr_entries - 1; 1335 unsigned int i; 1336 1337 nr = min(nr, zcrx_rq_entries(rq)); 1338 for (i = 0; i < nr; i++) { 1339 struct io_uring_zcrx_rqe *rqe = zcrx_next_rqe(rq, mask); 1340 struct net_iov *niov; 1341 1342 if (!io_parse_rqe(rqe, zcrx, &niov)) 1343 break; 1344 netmem_array[i] = net_iov_to_netmem(niov); 1345 } 1346 1347 smp_store_release(&rq->ring->head, rq->cached_head); 1348 return i; 1349 } 1350 1351 #define ZCRX_FLUSH_BATCH 32 1352 1353 static void zcrx_return_buffers(netmem_ref *netmems, unsigned nr) 1354 { 1355 unsigned i; 1356 1357 for (i = 0; i < nr; i++) { 1358 netmem_ref netmem = netmems[i]; 1359 struct net_iov *niov = netmem_to_net_iov(netmem); 1360 1361 if (!io_zcrx_put_niov_uref(niov)) 1362 continue; 1363 if (!page_pool_unref_and_test(netmem)) 1364 continue; 1365 io_zcrx_return_niov(niov); 1366 } 1367 } 1368 1369 static int zcrx_flush_rq(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx, 1370 struct zcrx_ctrl *ctrl) 1371 { 1372 struct zcrx_ctrl_flush_rq *frq = &ctrl->zc_flush; 1373 netmem_ref netmems[ZCRX_FLUSH_BATCH]; 1374 unsigned total = 0; 1375 unsigned nr; 1376 1377 if (!mem_is_zero(&frq->__resv, sizeof(frq->__resv))) 1378 return -EINVAL; 1379 1380 do { 1381 struct zcrx_rq *rq = &zcrx->rq; 1382 1383 scoped_guard(spinlock_bh, &rq->lock) { 1384 nr = zcrx_parse_rq(netmems, ZCRX_FLUSH_BATCH, zcrx, rq); 1385 zcrx_return_buffers(netmems, nr); 1386 } 1387 1388 total += nr; 1389 1390 if (fatal_signal_pending(current)) 1391 break; 1392 cond_resched(); 1393 } while (nr == ZCRX_FLUSH_BATCH && total < zcrx->rq.nr_entries); 1394 1395 return 0; 1396 } 1397 1398 static int zcrx_arm_notif(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx, 1399 struct zcrx_ctrl *ctrl) 1400 { 1401 const struct zcrx_ctrl_arm_notif *an = &ctrl->zc_arm_notif; 1402 unsigned type_mask; 1403 1404 if (an->notif_type >= __ZCRX_NOTIF_TYPE_LAST) 1405 return -EINVAL; 1406 if (!mem_is_zero(&an->__resv, sizeof(an->__resv))) 1407 return -EINVAL; 1408 1409 guard(spinlock_bh)(&zcrx->ctx_lock); 1410 type_mask = 1U << an->notif_type; 1411 if (type_mask & ~zcrx->fired_notifs) 1412 return -EINVAL; 1413 zcrx->fired_notifs &= ~type_mask; 1414 return 0; 1415 } 1416 1417 int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) 1418 { 1419 struct zcrx_ctrl ctrl; 1420 struct io_zcrx_ifq *zcrx; 1421 1422 BUILD_BUG_ON(sizeof(ctrl.zc_export) != sizeof(ctrl.zc_flush)); 1423 BUILD_BUG_ON(sizeof(ctrl.zc_export) != sizeof(ctrl.zc_arm_notif)); 1424 1425 if (nr_args) 1426 return -EINVAL; 1427 if (copy_from_user(&ctrl, arg, sizeof(ctrl))) 1428 return -EFAULT; 1429 if (!mem_is_zero(&ctrl.__resv, sizeof(ctrl.__resv))) 1430 return -EFAULT; 1431 1432 zcrx = xa_load(&ctx->zcrx_ctxs, ctrl.zcrx_id); 1433 if (!zcrx) 1434 return -ENXIO; 1435 1436 switch (ctrl.op) { 1437 case ZCRX_CTRL_FLUSH_RQ: 1438 return zcrx_flush_rq(ctx, zcrx, &ctrl); 1439 case ZCRX_CTRL_EXPORT: 1440 return zcrx_export(ctx, zcrx, &ctrl, arg); 1441 case ZCRX_CTRL_ARM_NOTIFICATION: 1442 return zcrx_arm_notif(ctx, zcrx, &ctrl); 1443 } 1444 1445 return -EOPNOTSUPP; 1446 } 1447 1448 static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov, 1449 struct io_zcrx_ifq *ifq, int off, int len) 1450 { 1451 struct io_ring_ctx *ctx = req->ctx; 1452 struct io_uring_zcrx_cqe *rcqe; 1453 struct io_zcrx_area *area; 1454 struct io_uring_cqe *cqe; 1455 u64 offset; 1456 1457 if (!io_defer_get_uncommited_cqe(ctx, &cqe)) 1458 return false; 1459 1460 cqe->user_data = req->cqe.user_data; 1461 cqe->res = len; 1462 cqe->flags = IORING_CQE_F_MORE; 1463 if (ctx->flags & IORING_SETUP_CQE_MIXED) 1464 cqe->flags |= IORING_CQE_F_32; 1465 1466 area = io_zcrx_iov_to_area(niov); 1467 offset = off + (net_iov_idx(niov) << ifq->niov_shift); 1468 rcqe = (struct io_uring_zcrx_cqe *)(cqe + 1); 1469 rcqe->off = offset + ((u64)area->area_id << IORING_ZCRX_AREA_SHIFT); 1470 rcqe->__pad = 0; 1471 return true; 1472 } 1473 1474 static struct net_iov *io_alloc_fallback_niov(struct io_zcrx_ifq *ifq) 1475 { 1476 struct io_zcrx_area *area = ifq->area; 1477 struct net_iov *niov = NULL; 1478 1479 if (!ifq->kern_readable) 1480 return NULL; 1481 1482 scoped_guard(spinlock_bh, &area->freelist_lock) 1483 niov = zcrx_get_free_niov(area); 1484 1485 if (niov) 1486 page_pool_fragment_netmem(net_iov_to_netmem(niov), 1); 1487 return niov; 1488 } 1489 1490 struct io_copy_cache { 1491 struct page *page; 1492 unsigned long offset; 1493 size_t size; 1494 }; 1495 1496 static ssize_t io_copy_page(struct io_copy_cache *cc, struct page *src_page, 1497 unsigned int src_offset, size_t len) 1498 { 1499 size_t copied = 0; 1500 1501 len = min(len, cc->size); 1502 1503 while (len) { 1504 void *src_addr, *dst_addr; 1505 struct page *dst_page = cc->page; 1506 unsigned dst_offset = cc->offset; 1507 size_t n = len; 1508 1509 if (folio_test_partial_kmap(page_folio(dst_page)) || 1510 folio_test_partial_kmap(page_folio(src_page))) { 1511 dst_page += dst_offset / PAGE_SIZE; 1512 dst_offset = offset_in_page(dst_offset); 1513 src_page += src_offset / PAGE_SIZE; 1514 src_offset = offset_in_page(src_offset); 1515 n = min(PAGE_SIZE - src_offset, PAGE_SIZE - dst_offset); 1516 n = min(n, len); 1517 } 1518 1519 dst_addr = kmap_local_page(dst_page) + dst_offset; 1520 src_addr = kmap_local_page(src_page) + src_offset; 1521 1522 memcpy(dst_addr, src_addr, n); 1523 1524 kunmap_local(src_addr); 1525 kunmap_local(dst_addr); 1526 1527 cc->size -= n; 1528 cc->offset += n; 1529 src_offset += n; 1530 len -= n; 1531 copied += n; 1532 } 1533 return copied; 1534 } 1535 1536 static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq, 1537 struct page *src_page, unsigned int src_offset, 1538 size_t len) 1539 { 1540 size_t copied = 0; 1541 int ret = 0; 1542 1543 while (len) { 1544 struct io_copy_cache cc; 1545 struct net_iov *niov; 1546 size_t n; 1547 1548 niov = io_alloc_fallback_niov(ifq); 1549 if (!niov) { 1550 ret = -ENOMEM; 1551 break; 1552 } 1553 1554 cc.page = io_zcrx_iov_page(niov); 1555 cc.offset = 0; 1556 cc.size = PAGE_SIZE; 1557 1558 n = io_copy_page(&cc, src_page, src_offset, len); 1559 1560 if (!io_zcrx_queue_cqe(req, niov, ifq, 0, n)) { 1561 io_zcrx_return_niov(niov); 1562 ret = -ENOSPC; 1563 break; 1564 } 1565 1566 io_zcrx_get_niov_uref(niov); 1567 src_offset += n; 1568 len -= n; 1569 copied += n; 1570 } 1571 1572 return copied ? copied : ret; 1573 } 1574 1575 static int io_zcrx_copy_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq, 1576 const skb_frag_t *frag, int off, int len) 1577 { 1578 struct page *page = skb_frag_page(frag); 1579 int ret; 1580 1581 ret = io_zcrx_copy_chunk(req, ifq, page, off + skb_frag_off(frag), len); 1582 if (ret > 0) { 1583 if (ifq->notif_stats) { 1584 zcrx_stat_add(&ifq->notif_stats->copy_count, 1); 1585 zcrx_stat_add(&ifq->notif_stats->copy_bytes, ret); 1586 } 1587 zcrx_send_notif(ifq, ZCRX_NOTIF_COPY); 1588 } 1589 1590 return ret; 1591 } 1592 1593 static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq, 1594 const skb_frag_t *frag, int off, int len) 1595 { 1596 struct net_iov *niov; 1597 struct page_pool *pp; 1598 1599 if (unlikely(!skb_frag_is_net_iov(frag))) 1600 return io_zcrx_copy_frag(req, ifq, frag, off, len); 1601 1602 niov = netmem_to_net_iov(frag->netmem); 1603 pp = niov->desc.pp; 1604 1605 if (!pp || pp->mp_ops != &io_uring_pp_zc_ops || io_pp_to_ifq(pp) != ifq) 1606 return -EFAULT; 1607 1608 if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len)) 1609 return -ENOSPC; 1610 1611 /* 1612 * Prevent it from being recycled while user is accessing it. 1613 * It has to be done before grabbing a user reference. 1614 */ 1615 page_pool_ref_netmem(net_iov_to_netmem(niov)); 1616 io_zcrx_get_niov_uref(niov); 1617 return len; 1618 } 1619 1620 static int 1621 io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb, 1622 unsigned int offset, size_t len) 1623 { 1624 struct io_zcrx_args *args = desc->arg.data; 1625 struct io_zcrx_ifq *ifq = args->ifq; 1626 struct io_kiocb *req = args->req; 1627 struct sk_buff *frag_iter; 1628 unsigned start, start_off = offset; 1629 int i, copy, end, off; 1630 int ret = 0; 1631 1632 len = min_t(size_t, len, desc->count); 1633 /* 1634 * __tcp_read_sock() always calls io_zcrx_recv_skb one last time, even 1635 * if desc->count is already 0. This is caused by the if (offset + 1 != 1636 * skb->len) check. Return early in this case to break out of 1637 * __tcp_read_sock(). 1638 */ 1639 if (!len) 1640 return 0; 1641 if (unlikely(args->nr_skbs++ > IO_SKBS_PER_CALL_LIMIT)) 1642 return -EAGAIN; 1643 1644 if (unlikely(offset < skb_headlen(skb))) { 1645 ssize_t copied; 1646 size_t to_copy; 1647 1648 to_copy = min_t(size_t, skb_headlen(skb) - offset, len); 1649 copied = io_zcrx_copy_chunk(req, ifq, virt_to_page(skb->data), 1650 offset_in_page(skb->data) + offset, 1651 to_copy); 1652 if (copied < 0) { 1653 ret = copied; 1654 goto out; 1655 } 1656 offset += copied; 1657 len -= copied; 1658 if (!len) 1659 goto out; 1660 if (offset != skb_headlen(skb)) 1661 goto out; 1662 } 1663 1664 start = skb_headlen(skb); 1665 1666 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 1667 const skb_frag_t *frag; 1668 1669 if (WARN_ON(start > offset + len)) 1670 return -EFAULT; 1671 1672 frag = &skb_shinfo(skb)->frags[i]; 1673 end = start + skb_frag_size(frag); 1674 1675 if (offset < end) { 1676 copy = end - offset; 1677 if (copy > len) 1678 copy = len; 1679 1680 off = offset - start; 1681 ret = io_zcrx_recv_frag(req, ifq, frag, off, copy); 1682 if (ret < 0) 1683 goto out; 1684 1685 offset += ret; 1686 len -= ret; 1687 if (len == 0 || ret != copy) 1688 goto out; 1689 } 1690 start = end; 1691 } 1692 1693 skb_walk_frags(skb, frag_iter) { 1694 if (WARN_ON(start > offset + len)) 1695 return -EFAULT; 1696 1697 end = start + frag_iter->len; 1698 if (offset < end) { 1699 size_t count; 1700 1701 copy = end - offset; 1702 if (copy > len) 1703 copy = len; 1704 1705 off = offset - start; 1706 count = desc->count; 1707 ret = io_zcrx_recv_skb(desc, frag_iter, off, copy); 1708 desc->count = count; 1709 if (ret < 0) 1710 goto out; 1711 1712 offset += ret; 1713 len -= ret; 1714 if (len == 0 || ret != copy) 1715 goto out; 1716 } 1717 start = end; 1718 } 1719 1720 out: 1721 if (offset == start_off) 1722 return ret; 1723 desc->count -= (offset - start_off); 1724 return offset - start_off; 1725 } 1726 1727 static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq, 1728 struct sock *sk, int flags, 1729 unsigned issue_flags, unsigned int *outlen) 1730 { 1731 unsigned int len = *outlen; 1732 struct io_zcrx_args args = { 1733 .req = req, 1734 .ifq = ifq, 1735 }; 1736 read_descriptor_t rd_desc = { 1737 .count = len ? len : UINT_MAX, 1738 .arg.data = &args, 1739 }; 1740 int ret; 1741 1742 lock_sock(sk); 1743 ret = tcp_read_sock(sk, &rd_desc, io_zcrx_recv_skb); 1744 if (len && ret > 0) 1745 *outlen = len - ret; 1746 if (ret <= 0) { 1747 if (ret < 0 || sock_flag(sk, SOCK_DONE)) 1748 goto out; 1749 if (sk->sk_err) 1750 ret = sock_error(sk); 1751 else if (sk->sk_shutdown & RCV_SHUTDOWN) 1752 goto out; 1753 else if (sk->sk_state == TCP_CLOSE) 1754 ret = -ENOTCONN; 1755 else 1756 ret = -EAGAIN; 1757 } else if (unlikely(args.nr_skbs > IO_SKBS_PER_CALL_LIMIT) && 1758 (issue_flags & IO_URING_F_MULTISHOT)) { 1759 ret = IOU_REQUEUE; 1760 } else if (sock_flag(sk, SOCK_DONE)) { 1761 /* Make it to retry until it finally gets 0. */ 1762 if (issue_flags & IO_URING_F_MULTISHOT) 1763 ret = IOU_REQUEUE; 1764 else 1765 ret = -EAGAIN; 1766 } 1767 out: 1768 release_sock(sk); 1769 return ret; 1770 } 1771 1772 int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, 1773 struct socket *sock, unsigned int flags, 1774 unsigned issue_flags, unsigned int *len) 1775 { 1776 struct sock *sk = sock->sk; 1777 const struct proto *prot = READ_ONCE(sk->sk_prot); 1778 1779 if (prot->recvmsg != tcp_recvmsg) 1780 return -EPROTONOSUPPORT; 1781 1782 sock_rps_record_flow(sk); 1783 return io_zcrx_tcp_recvmsg(req, ifq, sk, flags, issue_flags, len); 1784 } 1785