// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/dma-map-ops.h>
#include <linux/mm.h>
#include <linux/nospec.h>
#include <linux/io_uring.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff_ref.h>

#include <net/page_pool/helpers.h>
#include <net/page_pool/memory_provider.h>
#include <net/netlink.h>
#include <net/netdev_rx_queue.h>
#include <net/tcp.h>
#include <net/rps.h>

#include <trace/events/page_pool.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "kbuf.h"
#include "memmap.h"
#include "zcrx.h"
#include "rsrc.h"

#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)

static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp)
{
	return pp->mp_priv;
}

static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
{
	struct net_iov_area *owner = net_iov_owner(niov);

	return container_of(owner, struct io_zcrx_area, nia);
}

static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
{
	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);

	return area->mem.pages[net_iov_idx(niov)];
}

static void io_release_dmabuf(struct io_zcrx_mem *mem)
{
	if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
		return;

	if (mem->sgt)
		dma_buf_unmap_attachment_unlocked(mem->attach, mem->sgt,
						  DMA_FROM_DEVICE);
	if (mem->attach)
		dma_buf_detach(mem->dmabuf, mem->attach);
	if (mem->dmabuf)
		dma_buf_put(mem->dmabuf);

	mem->sgt = NULL;
	mem->attach = NULL;
	mem->dmabuf = NULL;
}

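/*
 * Import a dmabuf-backed area: take a reference to the dmabuf, attach it to
 * the netdev's parent device and map the attachment for DMA_FROM_DEVICE.
 * The requested [addr, addr + len) range is checked against the total
 * length of the DMA sg table before it is recorded as the area's offset
 * and size.
 */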
static int io_import_dmabuf(struct io_zcrx_ifq *ifq,
			    struct io_zcrx_mem *mem,
			    struct io_uring_zcrx_area_reg *area_reg)
{
	unsigned long off = (unsigned long)area_reg->addr;
	unsigned long len = (unsigned long)area_reg->len;
	unsigned long total_size = 0;
	struct scatterlist *sg;
	int dmabuf_fd = area_reg->dmabuf_fd;
	int i, ret;

	if (WARN_ON_ONCE(!ifq->dev))
		return -EFAULT;
	if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
		return -EINVAL;

	mem->is_dmabuf = true;
	mem->dmabuf = dma_buf_get(dmabuf_fd);
	if (IS_ERR(mem->dmabuf)) {
		ret = PTR_ERR(mem->dmabuf);
		mem->dmabuf = NULL;
		goto err;
	}

	mem->attach = dma_buf_attach(mem->dmabuf, ifq->dev);
	if (IS_ERR(mem->attach)) {
		ret = PTR_ERR(mem->attach);
		mem->attach = NULL;
		goto err;
	}

	mem->sgt = dma_buf_map_attachment_unlocked(mem->attach, DMA_FROM_DEVICE);
	if (IS_ERR(mem->sgt)) {
		ret = PTR_ERR(mem->sgt);
		mem->sgt = NULL;
		goto err;
	}

	for_each_sgtable_dma_sg(mem->sgt, sg, i)
		total_size += sg_dma_len(sg);

	if (total_size < off + len) {
		ret = -EINVAL;
		goto err;
	}

	mem->dmabuf_offset = off;
	mem->size = len;
	return 0;
err:
	io_release_dmabuf(mem);
	return ret;
}

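/*
 * Walk the dmabuf's DMA sg table, skip the registered offset and hand out
 * page sized chunks to the area's net_iovs. Returns the number of niovs
 * that were given a DMA address (or 0 if setting one fails); the caller
 * treats anything short of area->nia.num_niovs as an error.
 */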
static int io_zcrx_map_area_dmabuf(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
	unsigned long off = area->mem.dmabuf_offset;
	struct scatterlist *sg;
	unsigned i, niov_idx = 0;

	if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
		return -EINVAL;

	for_each_sgtable_dma_sg(area->mem.sgt, sg, i) {
		dma_addr_t dma = sg_dma_address(sg);
		unsigned long sg_len = sg_dma_len(sg);
		unsigned long sg_off = min(sg_len, off);

		off -= sg_off;
		sg_len -= sg_off;
		dma += sg_off;

		while (sg_len && niov_idx < area->nia.num_niovs) {
			struct net_iov *niov = &area->nia.niovs[niov_idx];

			if (net_mp_niov_set_dma_addr(niov, dma))
				return 0;
			sg_len -= PAGE_SIZE;
			dma += PAGE_SIZE;
			niov_idx++;
		}
	}
	return niov_idx;
}

static int io_import_umem(struct io_zcrx_ifq *ifq,
			  struct io_zcrx_mem *mem,
			  struct io_uring_zcrx_area_reg *area_reg)
{
	struct page **pages;
	int nr_pages;

	if (area_reg->dmabuf_fd)
		return -EINVAL;
	if (!area_reg->addr)
		return -EFAULT;
	pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
			     &nr_pages);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	mem->pages = pages;
	mem->nr_folios = nr_pages;
	mem->size = area_reg->len;
	return 0;
}

static void io_release_area_mem(struct io_zcrx_mem *mem)
{
	if (mem->is_dmabuf) {
		io_release_dmabuf(mem);
		return;
	}
	if (mem->pages) {
		unpin_user_pages(mem->pages, mem->nr_folios);
		kvfree(mem->pages);
	}
}

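/*
 * Import the user-described memory backing an area. The range must be page
 * aligned and pass the generic user buffer validation; depending on
 * IORING_ZCRX_AREA_DMABUF it is either a dmabuf or pinned user memory.
 */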
static int io_import_area(struct io_zcrx_ifq *ifq,
			  struct io_zcrx_mem *mem,
			  struct io_uring_zcrx_area_reg *area_reg)
{
	int ret;

	ret = io_validate_user_buf_range(area_reg->addr, area_reg->len);
	if (ret)
		return ret;
	if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
		return -EINVAL;

	if (area_reg->flags & IORING_ZCRX_AREA_DMABUF)
		return io_import_dmabuf(ifq, mem, area_reg);
	return io_import_umem(ifq, mem, area_reg);
}

static void io_zcrx_unmap_umem(struct io_zcrx_ifq *ifq,
			       struct io_zcrx_area *area, int nr_mapped)
{
	int i;

	for (i = 0; i < nr_mapped; i++) {
		netmem_ref netmem = net_iov_to_netmem(&area->nia.niovs[i]);
		dma_addr_t dma = page_pool_get_dma_addr_netmem(netmem);

		dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE,
				     DMA_FROM_DEVICE, IO_DMA_ATTR);
	}
}

static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
				 struct io_zcrx_area *area, int nr_mapped)
{
	int i;

	if (area->mem.is_dmabuf)
		io_release_dmabuf(&area->mem);
	else
		io_zcrx_unmap_umem(ifq, area, nr_mapped);

	for (i = 0; i < area->nia.num_niovs; i++)
		net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0);
}

static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
	guard(mutex)(&ifq->dma_lock);

	if (area->is_mapped)
		__io_zcrx_unmap_area(ifq, area, area->nia.num_niovs);
	area->is_mapped = false;
}

static int io_zcrx_map_area_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
	int i;

	for (i = 0; i < area->nia.num_niovs; i++) {
		struct net_iov *niov = &area->nia.niovs[i];
		dma_addr_t dma;

		dma = dma_map_page_attrs(ifq->dev, area->mem.pages[i], 0,
					 PAGE_SIZE, DMA_FROM_DEVICE, IO_DMA_ATTR);
		if (dma_mapping_error(ifq->dev, dma))
			break;
		if (net_mp_niov_set_dma_addr(niov, dma)) {
			dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE,
					     DMA_FROM_DEVICE, IO_DMA_ATTR);
			break;
		}
	}
	return i;
}

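/*
 * DMA map the whole area, serialised by ifq->dma_lock. This is called from
 * the page pool init path and is a no-op if the area is already mapped;
 * a partial mapping is rolled back and reported as -EINVAL.
 */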
static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
	unsigned nr;

	guard(mutex)(&ifq->dma_lock);
	if (area->is_mapped)
		return 0;

	if (area->mem.is_dmabuf)
		nr = io_zcrx_map_area_dmabuf(ifq, area);
	else
		nr = io_zcrx_map_area_umem(ifq, area);

	if (nr != area->nia.num_niovs) {
		__io_zcrx_unmap_area(ifq, area, nr);
		return -EINVAL;
	}

	area->is_mapped = true;
	return 0;
}

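/* Sync a buffer for the device before it is handed back to the hardware. */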
static void io_zcrx_sync_for_device(const struct page_pool *pool,
				    struct net_iov *niov)
{
#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
	dma_addr_t dma_addr;

	if (!dma_dev_need_sync(pool->p.dev))
		return;

	dma_addr = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov));
	__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
				     PAGE_SIZE, pool->p.dma_dir);
#endif
}

#define IO_RQ_MAX_ENTRIES	32768

#define IO_SKBS_PER_CALL_LIMIT	20

struct io_zcrx_args {
	struct io_kiocb		*req;
	struct io_zcrx_ifq	*ifq;
	struct socket		*sock;
	unsigned		nr_skbs;
};

static const struct memory_provider_ops io_uring_pp_zc_ops;

static inline atomic_t *io_get_user_counter(struct net_iov *niov)
{
	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);

	return &area->user_refs[net_iov_idx(niov)];
}

static bool io_zcrx_put_niov_uref(struct net_iov *niov)
{
	atomic_t *uref = io_get_user_counter(niov);

	if (unlikely(!atomic_read(uref)))
		return false;
	atomic_dec(uref);
	return true;
}

static void io_zcrx_get_niov_uref(struct net_iov *niov)
{
	atomic_inc(io_get_user_counter(niov));
}

static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
				 struct io_uring_zcrx_ifq_reg *reg,
				 struct io_uring_region_desc *rd,
				 u32 id)
{
	u64 mmap_offset;
	size_t off, size;
	void *ptr;
	int ret;

	off = sizeof(struct io_uring);
	size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries;
	if (size > rd->size)
		return -EINVAL;

	mmap_offset = IORING_MAP_OFF_ZCRX_REGION;
	mmap_offset += id << IORING_OFF_PBUF_SHIFT;

	ret = io_create_region(ifq->ctx, &ifq->region, rd, mmap_offset);
	if (ret < 0)
		return ret;

	ptr = io_region_get_ptr(&ifq->region);
	ifq->rq_ring = (struct io_uring *)ptr;
	ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
	return 0;
}

static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
{
	io_free_region(ifq->ctx, &ifq->region);
	ifq->rq_ring = NULL;
	ifq->rqes = NULL;
}

static void io_zcrx_free_area(struct io_zcrx_area *area)
{
	if (area->ifq)
		io_zcrx_unmap_area(area->ifq, area);
	io_release_area_mem(&area->mem);

	kvfree(area->freelist);
	kvfree(area->nia.niovs);
	kvfree(area->user_refs);
	kfree(area);
}

#define IO_ZCRX_AREA_SUPPORTED_FLAGS	(IORING_ZCRX_AREA_DMABUF)

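/*
 * Create the single area backing an interface queue: import the user memory
 * (or dmabuf), then allocate the net_iov array, the freelist and the user
 * reference counters, one entry per page sized chunk.
 */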
static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
			       struct io_zcrx_area **res,
			       struct io_uring_zcrx_area_reg *area_reg)
{
	struct io_zcrx_area *area;
	unsigned nr_iovs;
	int i, ret;

	if (area_reg->flags & ~IO_ZCRX_AREA_SUPPORTED_FLAGS)
		return -EINVAL;
	if (area_reg->rq_area_token)
		return -EINVAL;
	if (area_reg->__resv2[0] || area_reg->__resv2[1])
		return -EINVAL;

	ret = -ENOMEM;
	area = kzalloc(sizeof(*area), GFP_KERNEL);
	if (!area)
		goto err;

	ret = io_import_area(ifq, &area->mem, area_reg);
	if (ret)
		goto err;

	nr_iovs = area->mem.size >> PAGE_SHIFT;
	area->nia.num_niovs = nr_iovs;

	ret = -ENOMEM;
	area->nia.niovs = kvmalloc_array(nr_iovs, sizeof(area->nia.niovs[0]),
					 GFP_KERNEL | __GFP_ZERO);
	if (!area->nia.niovs)
		goto err;

	area->freelist = kvmalloc_array(nr_iovs, sizeof(area->freelist[0]),
					GFP_KERNEL | __GFP_ZERO);
	if (!area->freelist)
		goto err;

	area->user_refs = kvmalloc_array(nr_iovs, sizeof(area->user_refs[0]),
					 GFP_KERNEL | __GFP_ZERO);
	if (!area->user_refs)
		goto err;

	for (i = 0; i < nr_iovs; i++) {
		struct net_iov *niov = &area->nia.niovs[i];

		niov->owner = &area->nia;
		area->freelist[i] = i;
		atomic_set(&area->user_refs[i], 0);
		niov->type = NET_IOV_IOURING;
	}

	area->free_count = nr_iovs;
	area->ifq = ifq;
	/* we're only supporting one area per ifq for now */
	area->area_id = 0;
	area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT;
	spin_lock_init(&area->freelist_lock);
	*res = area;
	return 0;
err:
	if (area)
		io_zcrx_free_area(area);
	return ret;
}

static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
{
	struct io_zcrx_ifq *ifq;

	ifq = kzalloc(sizeof(*ifq), GFP_KERNEL);
	if (!ifq)
		return NULL;

	ifq->if_rxq = -1;
	ifq->ctx = ctx;
	spin_lock_init(&ifq->lock);
	spin_lock_init(&ifq->rq_lock);
	mutex_init(&ifq->dma_lock);
	return ifq;
}

static void io_zcrx_drop_netdev(struct io_zcrx_ifq *ifq)
{
	spin_lock(&ifq->lock);
	if (ifq->netdev) {
		netdev_put(ifq->netdev, &ifq->netdev_tracker);
		ifq->netdev = NULL;
	}
	spin_unlock(&ifq->lock);
}

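/*
 * Detach the memory provider from the hardware RX queue and drop the netdev
 * reference. Safe to call repeatedly; it does nothing once if_rxq has been
 * cleared.
 */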
static void io_close_queue(struct io_zcrx_ifq *ifq)
{
	struct net_device *netdev;
	netdevice_tracker netdev_tracker;
	struct pp_memory_provider_params p = {
		.mp_ops = &io_uring_pp_zc_ops,
		.mp_priv = ifq,
	};

	if (ifq->if_rxq == -1)
		return;

	spin_lock(&ifq->lock);
	netdev = ifq->netdev;
	netdev_tracker = ifq->netdev_tracker;
	ifq->netdev = NULL;
	spin_unlock(&ifq->lock);

	if (netdev) {
		net_mp_close_rxq(netdev, ifq->if_rxq, &p);
		netdev_put(netdev, &netdev_tracker);
	}
	ifq->if_rxq = -1;
}

static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
{
	io_close_queue(ifq);
	io_zcrx_drop_netdev(ifq);

	if (ifq->area)
		io_zcrx_free_area(ifq->area);
	if (ifq->dev)
		put_device(ifq->dev);

	io_free_rbuf_ring(ifq);
	mutex_destroy(&ifq->dma_lock);
	kfree(ifq);
}

struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
					    unsigned int id)
{
	struct io_zcrx_ifq *ifq = xa_load(&ctx->zcrx_ctxs, id);

	lockdep_assert_held(&ctx->mmap_lock);

	return ifq ? &ifq->region : NULL;
}

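/*
 * Register a zero copy RX interface queue: validate the registration
 * structures, allocate the refill ring region, resolve the target netdev
 * and its DMA device, create the buffer area and finally bind the io_uring
 * memory provider to the requested hardware RX queue.
 */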
int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
			 struct io_uring_zcrx_ifq_reg __user *arg)
{
	struct pp_memory_provider_params mp_param = {};
	struct io_uring_zcrx_area_reg area;
	struct io_uring_zcrx_ifq_reg reg;
	struct io_uring_region_desc rd;
	struct io_zcrx_ifq *ifq;
	int ret;
	u32 id;

	/*
	 * Require CAP_NET_ADMIN: registering an ifq both reconfigures an
	 * interface queue and lets the caller observe data destined for
	 * sockets of other tasks.
	 */
	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	/* mandatory io_uring features for zc rx */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN &&
	      ctx->flags & IORING_SETUP_CQE32))
		return -EINVAL;
	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)) ||
	    reg.__resv2 || reg.zcrx_id)
		return -EINVAL;
	if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
		return -EINVAL;
	if (reg.rq_entries > IO_RQ_MAX_ENTRIES) {
		if (!(ctx->flags & IORING_SETUP_CLAMP))
			return -EINVAL;
		reg.rq_entries = IO_RQ_MAX_ENTRIES;
	}
	reg.rq_entries = roundup_pow_of_two(reg.rq_entries);

	if (copy_from_user(&area, u64_to_user_ptr(reg.area_ptr), sizeof(area)))
		return -EFAULT;

	ifq = io_zcrx_ifq_alloc(ctx);
	if (!ifq)
		return -ENOMEM;
	ifq->rq_entries = reg.rq_entries;

	scoped_guard(mutex, &ctx->mmap_lock) {
		/* preallocate id */
		ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL);
		if (ret)
			goto ifq_free;
	}

	ret = io_allocate_rbuf_ring(ifq, &reg, &rd, id);
	if (ret)
		goto err;

	ifq->netdev = netdev_get_by_index(current->nsproxy->net_ns, reg.if_idx,
					  &ifq->netdev_tracker, GFP_KERNEL);
	if (!ifq->netdev) {
		ret = -ENODEV;
		goto err;
	}

	ifq->dev = ifq->netdev->dev.parent;
	if (!ifq->dev) {
		ret = -EOPNOTSUPP;
		goto err;
	}
	get_device(ifq->dev);

	ret = io_zcrx_create_area(ifq, &ifq->area, &area);
	if (ret)
		goto err;

	mp_param.mp_ops = &io_uring_pp_zc_ops;
	mp_param.mp_priv = ifq;
	ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param);
	if (ret)
		goto err;
	ifq->if_rxq = reg.if_rxq;

	reg.offsets.rqes = sizeof(struct io_uring);
	reg.offsets.head = offsetof(struct io_uring, head);
	reg.offsets.tail = offsetof(struct io_uring, tail);
	reg.zcrx_id = id;

	scoped_guard(mutex, &ctx->mmap_lock) {
		/* publish ifq */
		ret = -ENOMEM;
		if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL))
			goto err;
	}

	if (copy_to_user(arg, &reg, sizeof(reg)) ||
	    copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) ||
	    copy_to_user(u64_to_user_ptr(reg.area_ptr), &area, sizeof(area))) {
		ret = -EFAULT;
		goto err;
	}
	return 0;
err:
	scoped_guard(mutex, &ctx->mmap_lock)
		xa_erase(&ctx->zcrx_ctxs, id);
ifq_free:
	io_zcrx_ifq_free(ifq);
	return ret;
}

void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
{
	struct io_zcrx_ifq *ifq;

	lockdep_assert_held(&ctx->uring_lock);

	while (1) {
		scoped_guard(mutex, &ctx->mmap_lock) {
			unsigned long id = 0;

			ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT);
			if (ifq)
				xa_erase(&ctx->zcrx_ctxs, id);
		}
		if (!ifq)
			break;
		io_zcrx_ifq_free(ifq);
	}

	xa_destroy(&ctx->zcrx_ctxs);
}

static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
{
	unsigned niov_idx;

	lockdep_assert_held(&area->freelist_lock);

	niov_idx = area->freelist[--area->free_count];
	return &area->nia.niovs[niov_idx];
}

static void io_zcrx_return_niov_freelist(struct net_iov *niov)
{
	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);

	spin_lock_bh(&area->freelist_lock);
	area->freelist[area->free_count++] = net_iov_idx(niov);
	spin_unlock_bh(&area->freelist_lock);
}

static void io_zcrx_return_niov(struct net_iov *niov)
{
	netmem_ref netmem = net_iov_to_netmem(niov);

	if (!niov->pp) {
		/* copy fallback allocated niovs */
		io_zcrx_return_niov_freelist(niov);
		return;
	}
	page_pool_put_unrefed_netmem(niov->pp, netmem, -1, false);
}

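/*
 * Reclaim buffers currently handed out to user space: the per-niov user
 * refcount is cleared, the matching number of page pool references is
 * dropped, and the buffer is returned once no references remain.
 */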
static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
{
	struct io_zcrx_area *area = ifq->area;
	int i;

	if (!area)
		return;

	/* Reclaim back all buffers given to the user space. */
	for (i = 0; i < area->nia.num_niovs; i++) {
		struct net_iov *niov = &area->nia.niovs[i];
		int nr;

		if (!atomic_read(io_get_user_counter(niov)))
			continue;
		nr = atomic_xchg(io_get_user_counter(niov), 0);
		if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
			io_zcrx_return_niov(niov);
	}
}

void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
{
	struct io_zcrx_ifq *ifq;
	unsigned long index;

	lockdep_assert_held(&ctx->uring_lock);

	xa_for_each(&ctx->zcrx_ctxs, index, ifq) {
		io_zcrx_scrub(ifq);
		io_close_queue(ifq);
	}
}

static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq)
{
	u32 entries;

	entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head;
	return min(entries, ifq->rq_entries);
}

static struct io_uring_zcrx_rqe *io_zcrx_get_rqe(struct io_zcrx_ifq *ifq,
						 unsigned mask)
{
	unsigned int idx = ifq->cached_rq_head++ & mask;

	return &ifq->rqes[idx];
}

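/*
 * Refill the page pool's allocation cache from the refill ring. Each rqe is
 * validated and returns one user reference; once the last page pool
 * reference is dropped, the buffer is synced for the device and placed in
 * the pool's cache.
 */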
static void io_zcrx_ring_refill(struct page_pool *pp,
				struct io_zcrx_ifq *ifq)
{
	unsigned int mask = ifq->rq_entries - 1;
	unsigned int entries;
	netmem_ref netmem;

	spin_lock_bh(&ifq->rq_lock);

	entries = io_zcrx_rqring_entries(ifq);
	entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL - pp->alloc.count);
	if (unlikely(!entries)) {
		spin_unlock_bh(&ifq->rq_lock);
		return;
	}

	do {
		struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask);
		struct io_zcrx_area *area;
		struct net_iov *niov;
		unsigned niov_idx, area_idx;

		area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT;
		niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) >> PAGE_SHIFT;

		if (unlikely(rqe->__pad || area_idx))
			continue;
		area = ifq->area;

		if (unlikely(niov_idx >= area->nia.num_niovs))
			continue;
		niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs);

		niov = &area->nia.niovs[niov_idx];
		if (!io_zcrx_put_niov_uref(niov))
			continue;

		netmem = net_iov_to_netmem(niov);
		if (page_pool_unref_netmem(netmem, 1) != 0)
			continue;

		if (unlikely(niov->pp != pp)) {
			io_zcrx_return_niov(niov);
			continue;
		}

		io_zcrx_sync_for_device(pp, niov);
		net_mp_netmem_place_in_cache(pp, netmem);
	} while (--entries);

	smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head);
	spin_unlock_bh(&ifq->rq_lock);
}

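/*
 * Slow path refill from the area freelist, used when the refill ring did
 * not provide enough buffers to satisfy the page pool.
 */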
static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq)
{
	struct io_zcrx_area *area = ifq->area;

	spin_lock_bh(&area->freelist_lock);
	while (area->free_count && pp->alloc.count < PP_ALLOC_CACHE_REFILL) {
		struct net_iov *niov = __io_zcrx_get_free_niov(area);
		netmem_ref netmem = net_iov_to_netmem(niov);

		net_mp_niov_set_page_pool(pp, niov);
		io_zcrx_sync_for_device(pp, niov);
		net_mp_netmem_place_in_cache(pp, netmem);
	}
	spin_unlock_bh(&area->freelist_lock);
}

static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
{
	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);

	/* pp should already be ensuring that */
	if (unlikely(pp->alloc.count))
		goto out_return;

	io_zcrx_ring_refill(pp, ifq);
	if (likely(pp->alloc.count))
		goto out_return;

	io_zcrx_refill_slow(pp, ifq);
	if (!pp->alloc.count)
		return 0;
out_return:
	return pp->alloc.cache[--pp->alloc.count];
}

static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)
{
	struct net_iov *niov;

	if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
		return false;

	niov = netmem_to_net_iov(netmem);
	net_mp_niov_clear_page_pool(niov);
	io_zcrx_return_niov_freelist(niov);
	return false;
}

static int io_pp_zc_init(struct page_pool *pp)
{
	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
	int ret;

	if (WARN_ON_ONCE(!ifq))
		return -EINVAL;
	if (WARN_ON_ONCE(ifq->dev != pp->p.dev))
		return -EINVAL;
	if (WARN_ON_ONCE(!pp->dma_map))
		return -EOPNOTSUPP;
	if (pp->p.order != 0)
		return -EOPNOTSUPP;
	if (pp->p.dma_dir != DMA_FROM_DEVICE)
		return -EOPNOTSUPP;

	ret = io_zcrx_map_area(ifq, ifq->area);
	if (ret)
		return ret;

	percpu_ref_get(&ifq->ctx->refs);
	return 0;
}

static void io_pp_zc_destroy(struct page_pool *pp)
{
	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
	struct io_zcrx_area *area = ifq->area;

	if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs))
		return;
	percpu_ref_put(&ifq->ctx->refs);
}

static int io_pp_nl_fill(void *mp_priv, struct sk_buff *rsp,
			 struct netdev_rx_queue *rxq)
{
	struct nlattr *nest;
	int type;

	type = rxq ? NETDEV_A_QUEUE_IO_URING : NETDEV_A_PAGE_POOL_IO_URING;
	nest = nla_nest_start(rsp, type);
	if (!nest)
		return -EMSGSIZE;
	nla_nest_end(rsp, nest);

	return 0;
}

static void io_pp_uninstall(void *mp_priv, struct netdev_rx_queue *rxq)
{
	struct pp_memory_provider_params *p = &rxq->mp_params;
	struct io_zcrx_ifq *ifq = mp_priv;

	io_zcrx_drop_netdev(ifq);
	if (ifq->area)
		io_zcrx_unmap_area(ifq, ifq->area);

	p->mp_ops = NULL;
	p->mp_priv = NULL;
}

static const struct memory_provider_ops io_uring_pp_zc_ops = {
	.alloc_netmems		= io_pp_zc_alloc_netmems,
	.release_netmem		= io_pp_zc_release_netmem,
	.init			= io_pp_zc_init,
	.destroy		= io_pp_zc_destroy,
	.nl_fill		= io_pp_nl_fill,
	.uninstall		= io_pp_uninstall,
};

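/*
 * Post a zcrx completion: the main part of the 32-byte CQE carries the
 * length and IORING_CQE_F_MORE, while the second half (io_uring_zcrx_cqe)
 * encodes the area token and the data offset within the area.
 */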
static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov,
			      struct io_zcrx_ifq *ifq, int off, int len)
{
	struct io_uring_zcrx_cqe *rcqe;
	struct io_zcrx_area *area;
	struct io_uring_cqe *cqe;
	u64 offset;

	if (!io_defer_get_uncommited_cqe(req->ctx, &cqe))
		return false;

	cqe->user_data = req->cqe.user_data;
	cqe->res = len;
	cqe->flags = IORING_CQE_F_MORE;

	area = io_zcrx_iov_to_area(niov);
	offset = off + (net_iov_idx(niov) << PAGE_SHIFT);
	rcqe = (struct io_uring_zcrx_cqe *)(cqe + 1);
	rcqe->off = offset + ((u64)area->area_id << IORING_ZCRX_AREA_SHIFT);
	rcqe->__pad = 0;
	return true;
}

static struct net_iov *io_zcrx_alloc_fallback(struct io_zcrx_area *area)
{
	struct net_iov *niov = NULL;

	spin_lock_bh(&area->freelist_lock);
	if (area->free_count)
		niov = __io_zcrx_get_free_niov(area);
	spin_unlock_bh(&area->freelist_lock);

	if (niov)
		page_pool_fragment_netmem(net_iov_to_netmem(niov), 1);
	return niov;
}

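/*
 * Copy fallback: when the data does not sit in a zcrx net_iov (e.g. it is
 * in the skb's linear part or in regular pages), copy it page by page into
 * freelist buffers and post a CQE per copied chunk. Not supported for
 * dmabuf-backed areas.
 */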
static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
				  void *src_base, struct page *src_page,
				  unsigned int src_offset, size_t len)
{
	struct io_zcrx_area *area = ifq->area;
	size_t copied = 0;
	int ret = 0;

	if (area->mem.is_dmabuf)
		return -EFAULT;

	while (len) {
		size_t copy_size = min_t(size_t, PAGE_SIZE, len);
		const int dst_off = 0;
		struct net_iov *niov;
		struct page *dst_page;
		void *dst_addr;

		niov = io_zcrx_alloc_fallback(area);
		if (!niov) {
			ret = -ENOMEM;
			break;
		}

		dst_page = io_zcrx_iov_page(niov);
		dst_addr = kmap_local_page(dst_page);
		if (src_page)
			src_base = kmap_local_page(src_page);

		memcpy(dst_addr, src_base + src_offset, copy_size);

		if (src_page)
			kunmap_local(src_base);
		kunmap_local(dst_addr);

		if (!io_zcrx_queue_cqe(req, niov, ifq, dst_off, copy_size)) {
			io_zcrx_return_niov(niov);
			ret = -ENOSPC;
			break;
		}

		io_zcrx_get_niov_uref(niov);
		src_offset += copy_size;
		len -= copy_size;
		copied += copy_size;
	}

	return copied ? copied : ret;
}

static int io_zcrx_copy_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
			     const skb_frag_t *frag, int off, int len)
{
	struct page *page = skb_frag_page(frag);
	u32 p_off, p_len, t, copied = 0;
	int ret = 0;

	off += skb_frag_off(frag);

	skb_frag_foreach_page(frag, off, len,
			      page, p_off, p_len, t) {
		ret = io_zcrx_copy_chunk(req, ifq, NULL, page, p_off, p_len);
		if (ret < 0)
			return copied ? copied : ret;
		copied += ret;
	}
	return copied;
}

static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
			     const skb_frag_t *frag, int off, int len)
{
	struct net_iov *niov;

	if (unlikely(!skb_frag_is_net_iov(frag)))
		return io_zcrx_copy_frag(req, ifq, frag, off, len);

	niov = netmem_to_net_iov(frag->netmem);
	if (!niov->pp || niov->pp->mp_ops != &io_uring_pp_zc_ops ||
	    io_pp_to_ifq(niov->pp) != ifq)
		return -EFAULT;

	if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len))
		return -ENOSPC;

	/*
	 * Prevent it from being recycled while user is accessing it.
	 * It has to be done before grabbing a user reference.
	 */
	page_pool_ref_netmem(net_iov_to_netmem(niov));
	io_zcrx_get_niov_uref(niov);
	return len;
}

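/*
 * tcp_read_sock() callback. Frags that already are zcrx net_iovs are handed
 * to user space by posting CQEs, everything else goes through the copy
 * fallback, and the amount of work per call is capped by
 * IO_SKBS_PER_CALL_LIMIT.
 */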
static int
io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
		 unsigned int offset, size_t len)
{
	struct io_zcrx_args *args = desc->arg.data;
	struct io_zcrx_ifq *ifq = args->ifq;
	struct io_kiocb *req = args->req;
	struct sk_buff *frag_iter;
	unsigned start, start_off = offset;
	int i, copy, end, off;
	int ret = 0;

	len = min_t(size_t, len, desc->count);
	/*
	 * __tcp_read_sock() always calls io_zcrx_recv_skb one last time, even
	 * if desc->count is already 0. This is caused by the if (offset + 1 !=
	 * skb->len) check. Return early in this case to break out of
	 * __tcp_read_sock().
	 */
	if (!len)
		return 0;
	if (unlikely(args->nr_skbs++ > IO_SKBS_PER_CALL_LIMIT))
		return -EAGAIN;

	if (unlikely(offset < skb_headlen(skb))) {
		ssize_t copied;
		size_t to_copy;

		to_copy = min_t(size_t, skb_headlen(skb) - offset, len);
		copied = io_zcrx_copy_chunk(req, ifq, skb->data, NULL,
					    offset, to_copy);
		if (copied < 0) {
			ret = copied;
			goto out;
		}
		offset += copied;
		len -= copied;
		if (!len)
			goto out;
		if (offset != skb_headlen(skb))
			goto out;
	}

	start = skb_headlen(skb);

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		const skb_frag_t *frag;

		if (WARN_ON(start > offset + len))
			return -EFAULT;

		frag = &skb_shinfo(skb)->frags[i];
		end = start + skb_frag_size(frag);

		if (offset < end) {
			copy = end - offset;
			if (copy > len)
				copy = len;

			off = offset - start;
			ret = io_zcrx_recv_frag(req, ifq, frag, off, copy);
			if (ret < 0)
				goto out;

			offset += ret;
			len -= ret;
			if (len == 0 || ret != copy)
				goto out;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		if (WARN_ON(start > offset + len))
			return -EFAULT;

		end = start + frag_iter->len;
		if (offset < end) {
			copy = end - offset;
			if (copy > len)
				copy = len;

			off = offset - start;
			ret = io_zcrx_recv_skb(desc, frag_iter, off, copy);
			if (ret < 0)
				goto out;

			offset += ret;
			len -= ret;
			if (len == 0 || ret != copy)
				goto out;
		}
		start = end;
	}

out:
	if (offset == start_off)
		return ret;
	desc->count -= (offset - start_off);
	return offset - start_off;
}

static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
			       struct sock *sk, int flags,
			       unsigned issue_flags, unsigned int *outlen)
{
	unsigned int len = *outlen;
	struct io_zcrx_args args = {
		.req = req,
		.ifq = ifq,
		.sock = sk->sk_socket,
	};
	read_descriptor_t rd_desc = {
		.count = len ? len : UINT_MAX,
		.arg.data = &args,
	};
	int ret;

	lock_sock(sk);
	ret = tcp_read_sock(sk, &rd_desc, io_zcrx_recv_skb);
	if (len && ret > 0)
		*outlen = len - ret;
	if (ret <= 0) {
		if (ret < 0 || sock_flag(sk, SOCK_DONE))
			goto out;
		if (sk->sk_err)
			ret = sock_error(sk);
		else if (sk->sk_shutdown & RCV_SHUTDOWN)
			goto out;
		else if (sk->sk_state == TCP_CLOSE)
			ret = -ENOTCONN;
		else
			ret = -EAGAIN;
	} else if (unlikely(args.nr_skbs > IO_SKBS_PER_CALL_LIMIT) &&
		   (issue_flags & IO_URING_F_MULTISHOT)) {
		ret = IOU_REQUEUE;
	} else if (sock_flag(sk, SOCK_DONE)) {
		/* Make it retry until it finally gets 0. */
		if (issue_flags & IO_URING_F_MULTISHOT)
			ret = IOU_REQUEUE;
		else
			ret = -EAGAIN;
	}
out:
	release_sock(sk);
	return ret;
}

int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
		 struct socket *sock, unsigned int flags,
		 unsigned issue_flags, unsigned int *len)
{
	struct sock *sk = sock->sk;
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot->recvmsg != tcp_recvmsg)
		return -EPROTONOSUPPORT;

	sock_rps_record_flow(sk);
	return io_zcrx_tcp_recvmsg(req, ifq, sk, flags, issue_flags, len);
}