// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/dma-map-ops.h>
#include <linux/mm.h>
#include <linux/nospec.h>
#include <linux/io_uring.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff_ref.h>

#include <net/page_pool/helpers.h>
#include <net/page_pool/memory_provider.h>
#include <net/netlink.h>
#include <net/netdev_rx_queue.h>
#include <net/tcp.h>
#include <net/rps.h>

#include <trace/events/page_pool.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "kbuf.h"
#include "memmap.h"
#include "zcrx.h"
#include "rsrc.h"

#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)

static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp)
{
	return pp->mp_priv;
}

static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
{
	struct net_iov_area *owner = net_iov_owner(niov);

	return container_of(owner, struct io_zcrx_area, nia);
}

static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
{
	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);

	return area->mem.pages[net_iov_idx(niov)];
}

static void io_release_dmabuf(struct io_zcrx_mem *mem)
{
	if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
		return;

	if (mem->sgt)
		dma_buf_unmap_attachment_unlocked(mem->attach, mem->sgt,
						  DMA_FROM_DEVICE);
	if (mem->attach)
		dma_buf_detach(mem->dmabuf, mem->attach);
	if (mem->dmabuf)
		dma_buf_put(mem->dmabuf);

	mem->sgt = NULL;
	mem->attach = NULL;
	mem->dmabuf = NULL;
}

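/*
 * Take a reference to the dma-buf behind area_reg->dmabuf_fd, attach it to
 * the netdev's parent device and map it to obtain a DMA sg-table. The
 * (addr, len) pair is interpreted as an offset/length into the dma-buf and
 * must fit within the mapped size.
 */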
static int io_import_dmabuf(struct io_zcrx_ifq *ifq,
			    struct io_zcrx_mem *mem,
			    struct io_uring_zcrx_area_reg *area_reg)
{
	unsigned long off = (unsigned long)area_reg->addr;
	unsigned long len = (unsigned long)area_reg->len;
	unsigned long total_size = 0;
	struct scatterlist *sg;
	int dmabuf_fd = area_reg->dmabuf_fd;
	int i, ret;

	if (WARN_ON_ONCE(!ifq->dev))
		return -EFAULT;
	if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
		return -EINVAL;

	mem->is_dmabuf = true;
	mem->dmabuf = dma_buf_get(dmabuf_fd);
	if (IS_ERR(mem->dmabuf)) {
		ret = PTR_ERR(mem->dmabuf);
		mem->dmabuf = NULL;
		goto err;
	}

	mem->attach = dma_buf_attach(mem->dmabuf, ifq->dev);
	if (IS_ERR(mem->attach)) {
		ret = PTR_ERR(mem->attach);
		mem->attach = NULL;
		goto err;
	}

	mem->sgt = dma_buf_map_attachment_unlocked(mem->attach, DMA_FROM_DEVICE);
	if (IS_ERR(mem->sgt)) {
		ret = PTR_ERR(mem->sgt);
		mem->sgt = NULL;
		goto err;
	}

	for_each_sgtable_dma_sg(mem->sgt, sg, i)
		total_size += sg_dma_len(sg);

	if (total_size < off + len) {
		ret = -EINVAL;
		goto err;
	}

	mem->dmabuf_offset = off;
	mem->size = len;
	return 0;
err:
	io_release_dmabuf(mem);
	return ret;
}

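/*
 * Walk the dma-buf sg-table, skip the registered offset and hand out one
 * page-sized DMA address per net_iov. Returns the number of niovs that got
 * a DMA address; the caller treats anything short of num_niovs as failure.
 */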
static int io_zcrx_map_area_dmabuf(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
	unsigned long off = area->mem.dmabuf_offset;
	struct scatterlist *sg;
	unsigned i, niov_idx = 0;

	if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
		return -EINVAL;

	for_each_sgtable_dma_sg(area->mem.sgt, sg, i) {
		dma_addr_t dma = sg_dma_address(sg);
		unsigned long sg_len = sg_dma_len(sg);
		unsigned long sg_off = min(sg_len, off);

		off -= sg_off;
		sg_len -= sg_off;
		dma += sg_off;

		while (sg_len && niov_idx < area->nia.num_niovs) {
			struct net_iov *niov = &area->nia.niovs[niov_idx];

			if (net_mp_niov_set_dma_addr(niov, dma))
				return 0;
			sg_len -= PAGE_SIZE;
			dma += PAGE_SIZE;
			niov_idx++;
		}
	}
	return niov_idx;
}

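/*
 * Pin a plain user memory range for the area. Mutually exclusive with
 * dma-buf registration, so a non-zero dmabuf_fd is rejected here.
 */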
static int io_import_umem(struct io_zcrx_ifq *ifq,
			  struct io_zcrx_mem *mem,
			  struct io_uring_zcrx_area_reg *area_reg)
{
	struct page **pages;
	int nr_pages;

	if (area_reg->dmabuf_fd)
		return -EINVAL;
	if (!area_reg->addr)
		return -EFAULT;
	pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
			     &nr_pages);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	mem->pages = pages;
	mem->nr_folios = nr_pages;
	mem->size = area_reg->len;
	return 0;
}

static void io_release_area_mem(struct io_zcrx_mem *mem)
{
	if (mem->is_dmabuf) {
		io_release_dmabuf(mem);
		return;
	}
	if (mem->pages) {
		unpin_user_pages(mem->pages, mem->nr_folios);
		kvfree(mem->pages);
	}
}

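/*
 * Validate the user-supplied (addr, len) range, require page alignment and
 * then import the backing memory either from a dma-buf or from pinned user
 * pages, depending on IORING_ZCRX_AREA_DMABUF.
 */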
static int io_import_area(struct io_zcrx_ifq *ifq,
			  struct io_zcrx_mem *mem,
			  struct io_uring_zcrx_area_reg *area_reg)
{
	int ret;

	ret = io_validate_user_buf_range(area_reg->addr, area_reg->len);
	if (ret)
		return ret;
	if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
		return -EINVAL;

	if (area_reg->flags & IORING_ZCRX_AREA_DMABUF)
		return io_import_dmabuf(ifq, mem, area_reg);
	return io_import_umem(ifq, mem, area_reg);
}

static void io_zcrx_unmap_umem(struct io_zcrx_ifq *ifq,
				struct io_zcrx_area *area, int nr_mapped)
{
	int i;

	for (i = 0; i < nr_mapped; i++) {
		netmem_ref netmem = net_iov_to_netmem(&area->nia.niovs[i]);
		dma_addr_t dma = page_pool_get_dma_addr_netmem(netmem);

		dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE,
				     DMA_FROM_DEVICE, IO_DMA_ATTR);
	}
}

static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
				 struct io_zcrx_area *area, int nr_mapped)
{
	int i;

	if (area->mem.is_dmabuf)
		io_release_dmabuf(&area->mem);
	else
		io_zcrx_unmap_umem(ifq, area, nr_mapped);

	for (i = 0; i < area->nia.num_niovs; i++)
		net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0);
}

static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
	guard(mutex)(&ifq->dma_lock);

	if (area->is_mapped)
		__io_zcrx_unmap_area(ifq, area, area->nia.num_niovs);
	area->is_mapped = false;
}

static int io_zcrx_map_area_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
	int i;

	for (i = 0; i < area->nia.num_niovs; i++) {
		struct net_iov *niov = &area->nia.niovs[i];
		dma_addr_t dma;

		dma = dma_map_page_attrs(ifq->dev, area->mem.pages[i], 0,
					 PAGE_SIZE, DMA_FROM_DEVICE, IO_DMA_ATTR);
		if (dma_mapping_error(ifq->dev, dma))
			break;
		if (net_mp_niov_set_dma_addr(niov, dma)) {
			dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE,
					     DMA_FROM_DEVICE, IO_DMA_ATTR);
			break;
		}
	}
	return i;
}

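/*
 * DMA-map the whole area under ifq->dma_lock. Idempotent: returns 0 early
 * if the area is already mapped. A partial mapping is rolled back and
 * reported as -EINVAL.
 */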
static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
	unsigned nr;

	guard(mutex)(&ifq->dma_lock);
	if (area->is_mapped)
		return 0;

	if (area->mem.is_dmabuf)
		nr = io_zcrx_map_area_dmabuf(ifq, area);
	else
		nr = io_zcrx_map_area_umem(ifq, area);

	if (nr != area->nia.num_niovs) {
		__io_zcrx_unmap_area(ifq, area, nr);
		return -EINVAL;
	}

	area->is_mapped = true;
	return 0;
}

static void io_zcrx_sync_for_device(const struct page_pool *pool,
				    struct net_iov *niov)
{
#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
	dma_addr_t dma_addr;

	if (!dma_dev_need_sync(pool->p.dev))
		return;

	dma_addr = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov));
	__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
				     PAGE_SIZE, pool->p.dma_dir);
#endif
}

#define IO_RQ_MAX_ENTRIES		32768

#define IO_SKBS_PER_CALL_LIMIT	20

struct io_zcrx_args {
	struct io_kiocb		*req;
	struct io_zcrx_ifq	*ifq;
	struct socket		*sock;
	unsigned		nr_skbs;
};

static const struct memory_provider_ops io_uring_pp_zc_ops;

static inline atomic_t *io_get_user_counter(struct net_iov *niov)
{
	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);

	return &area->user_refs[net_iov_idx(niov)];
}

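/*
 * Drop one userspace reference from a niov. Returns false if the counter is
 * already zero, i.e. userspace returned a buffer it doesn't own.
 */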
static bool io_zcrx_put_niov_uref(struct net_iov *niov)
{
	atomic_t *uref = io_get_user_counter(niov);

	if (unlikely(!atomic_read(uref)))
		return false;
	atomic_dec(uref);
	return true;
}

static void io_zcrx_get_niov_uref(struct net_iov *niov)
{
	atomic_inc(io_get_user_counter(niov));
}

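/*
 * Create the mmap'able refill ring region: a struct io_uring header followed
 * by rq_entries struct io_uring_zcrx_rqe entries. The mmap offset is derived
 * from the preallocated zcrx id.
 */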
static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
				 struct io_uring_zcrx_ifq_reg *reg,
				 struct io_uring_region_desc *rd,
				 u32 id)
{
	u64 mmap_offset;
	size_t off, size;
	void *ptr;
	int ret;

	off = sizeof(struct io_uring);
	size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries;
	if (size > rd->size)
		return -EINVAL;

	mmap_offset = IORING_MAP_OFF_ZCRX_REGION;
	mmap_offset += id << IORING_OFF_PBUF_SHIFT;

	ret = io_create_region(ifq->ctx, &ifq->region, rd, mmap_offset);
	if (ret < 0)
		return ret;

	ptr = io_region_get_ptr(&ifq->region);
	ifq->rq_ring = (struct io_uring *)ptr;
	ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
	return 0;
}

static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
{
	io_free_region(ifq->ctx, &ifq->region);
	ifq->rq_ring = NULL;
	ifq->rqes = NULL;
}

static void io_zcrx_free_area(struct io_zcrx_area *area)
{
	if (area->ifq)
		io_zcrx_unmap_area(area->ifq, area);
	io_release_area_mem(&area->mem);

	kvfree(area->freelist);
	kvfree(area->nia.niovs);
	kvfree(area->user_refs);
	kfree(area);
}

#define IO_ZCRX_AREA_SUPPORTED_FLAGS	(IORING_ZCRX_AREA_DMABUF)

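/*
 * Allocate and populate an io_zcrx_area from a userspace area registration:
 * import the backing memory, then set up one net_iov, one freelist slot and
 * one user refcount per page of the area.
 */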
static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
			       struct io_zcrx_area **res,
			       struct io_uring_zcrx_area_reg *area_reg)
{
	struct io_zcrx_area *area;
	unsigned nr_iovs;
	int i, ret;

	if (area_reg->flags & ~IO_ZCRX_AREA_SUPPORTED_FLAGS)
		return -EINVAL;
	if (area_reg->rq_area_token)
		return -EINVAL;
	if (area_reg->__resv2[0] || area_reg->__resv2[1])
		return -EINVAL;

	ret = -ENOMEM;
	area = kzalloc(sizeof(*area), GFP_KERNEL);
	if (!area)
		goto err;

	ret = io_import_area(ifq, &area->mem, area_reg);
	if (ret)
		goto err;

	nr_iovs = area->mem.size >> PAGE_SHIFT;
	area->nia.num_niovs = nr_iovs;

	ret = -ENOMEM;
	area->nia.niovs = kvmalloc_array(nr_iovs, sizeof(area->nia.niovs[0]),
					 GFP_KERNEL | __GFP_ZERO);
	if (!area->nia.niovs)
		goto err;

	area->freelist = kvmalloc_array(nr_iovs, sizeof(area->freelist[0]),
					GFP_KERNEL | __GFP_ZERO);
	if (!area->freelist)
		goto err;

	area->user_refs = kvmalloc_array(nr_iovs, sizeof(area->user_refs[0]),
					GFP_KERNEL | __GFP_ZERO);
	if (!area->user_refs)
		goto err;

	for (i = 0; i < nr_iovs; i++) {
		struct net_iov *niov = &area->nia.niovs[i];

		niov->owner = &area->nia;
		area->freelist[i] = i;
		atomic_set(&area->user_refs[i], 0);
		niov->type = NET_IOV_IOURING;
	}

	area->free_count = nr_iovs;
	area->ifq = ifq;
	/* we're only supporting one area per ifq for now */
	area->area_id = 0;
	area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT;
	spin_lock_init(&area->freelist_lock);
	*res = area;
	return 0;
err:
	if (area)
		io_zcrx_free_area(area);
	return ret;
}

static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
{
	struct io_zcrx_ifq *ifq;

	ifq = kzalloc(sizeof(*ifq), GFP_KERNEL);
	if (!ifq)
		return NULL;

	ifq->if_rxq = -1;
	ifq->ctx = ctx;
	spin_lock_init(&ifq->lock);
	spin_lock_init(&ifq->rq_lock);
	mutex_init(&ifq->dma_lock);
	return ifq;
}

static void io_zcrx_drop_netdev(struct io_zcrx_ifq *ifq)
{
	spin_lock(&ifq->lock);
	if (ifq->netdev) {
		netdev_put(ifq->netdev, &ifq->netdev_tracker);
		ifq->netdev = NULL;
	}
	spin_unlock(&ifq->lock);
}

static void io_close_queue(struct io_zcrx_ifq *ifq)
{
	struct net_device *netdev;
	netdevice_tracker netdev_tracker;
	struct pp_memory_provider_params p = {
		.mp_ops = &io_uring_pp_zc_ops,
		.mp_priv = ifq,
	};

	if (ifq->if_rxq == -1)
		return;

	spin_lock(&ifq->lock);
	netdev = ifq->netdev;
	netdev_tracker = ifq->netdev_tracker;
	ifq->netdev = NULL;
	spin_unlock(&ifq->lock);

	if (netdev) {
		net_mp_close_rxq(netdev, ifq->if_rxq, &p);
		netdev_put(netdev, &netdev_tracker);
	}
	ifq->if_rxq = -1;
}

static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
{
	io_close_queue(ifq);
	io_zcrx_drop_netdev(ifq);

	if (ifq->area)
		io_zcrx_free_area(ifq->area);
	if (ifq->dev)
		put_device(ifq->dev);

	io_free_rbuf_ring(ifq);
	mutex_destroy(&ifq->dma_lock);
	kfree(ifq);
}

struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
					    unsigned int id)
{
	struct io_zcrx_ifq *ifq = xa_load(&ctx->zcrx_ctxs, id);

	lockdep_assert_held(&ctx->mmap_lock);

	return ifq ? &ifq->region : NULL;
}

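/*
 * Register a zero-copy RX interface queue: validate the registration structs,
 * allocate the ifq and its refill ring, take references on the netdev and its
 * parent device, create the buffer area and bind the io_uring memory provider
 * to the requested HW RX queue. On success the updated descriptors are copied
 * back to userspace.
 */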
int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
			  struct io_uring_zcrx_ifq_reg __user *arg)
{
	struct pp_memory_provider_params mp_param = {};
	struct io_uring_zcrx_area_reg area;
	struct io_uring_zcrx_ifq_reg reg;
	struct io_uring_region_desc rd;
	struct io_zcrx_ifq *ifq;
	int ret;
	u32 id;

	/*
	 * CAP_NET_ADMIN is required because:
	 * 1. Registration allocates and reconfigures an interface RX queue.
	 * 2. It can observe data destined for sockets of other tasks.
	 */
	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	/* mandatory io_uring features for zc rx */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN &&
	      ctx->flags & IORING_SETUP_CQE32))
		return -EINVAL;
	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)) ||
	    reg.__resv2 || reg.zcrx_id)
		return -EINVAL;
	if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
		return -EINVAL;
	if (reg.rq_entries > IO_RQ_MAX_ENTRIES) {
		if (!(ctx->flags & IORING_SETUP_CLAMP))
			return -EINVAL;
		reg.rq_entries = IO_RQ_MAX_ENTRIES;
	}
	reg.rq_entries = roundup_pow_of_two(reg.rq_entries);

	if (copy_from_user(&area, u64_to_user_ptr(reg.area_ptr), sizeof(area)))
		return -EFAULT;

	ifq = io_zcrx_ifq_alloc(ctx);
	if (!ifq)
		return -ENOMEM;
	ifq->rq_entries = reg.rq_entries;

	scoped_guard(mutex, &ctx->mmap_lock) {
		/* preallocate id */
		ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL);
		if (ret)
			goto ifq_free;
	}

	ret = io_allocate_rbuf_ring(ifq, &reg, &rd, id);
	if (ret)
		goto err;

	ifq->netdev = netdev_get_by_index(current->nsproxy->net_ns, reg.if_idx,
					  &ifq->netdev_tracker, GFP_KERNEL);
	if (!ifq->netdev) {
		ret = -ENODEV;
		goto err;
	}

	ifq->dev = ifq->netdev->dev.parent;
	if (!ifq->dev) {
		ret = -EOPNOTSUPP;
		goto err;
	}
	get_device(ifq->dev);

	ret = io_zcrx_create_area(ifq, &ifq->area, &area);
	if (ret)
		goto err;

	mp_param.mp_ops = &io_uring_pp_zc_ops;
	mp_param.mp_priv = ifq;
	ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param);
	if (ret)
		goto err;
	ifq->if_rxq = reg.if_rxq;

	reg.offsets.rqes = sizeof(struct io_uring);
	reg.offsets.head = offsetof(struct io_uring, head);
	reg.offsets.tail = offsetof(struct io_uring, tail);
	reg.zcrx_id = id;

	scoped_guard(mutex, &ctx->mmap_lock) {
		/* publish ifq */
		ret = -ENOMEM;
		if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL))
			goto err;
	}

	if (copy_to_user(arg, &reg, sizeof(reg)) ||
	    copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) ||
	    copy_to_user(u64_to_user_ptr(reg.area_ptr), &area, sizeof(area))) {
		ret = -EFAULT;
		goto err;
	}
	return 0;
err:
	scoped_guard(mutex, &ctx->mmap_lock)
		xa_erase(&ctx->zcrx_ctxs, id);
ifq_free:
	io_zcrx_ifq_free(ifq);
	return ret;
}

void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
{
	struct io_zcrx_ifq *ifq;

	lockdep_assert_held(&ctx->uring_lock);

	while (1) {
		scoped_guard(mutex, &ctx->mmap_lock) {
			unsigned long id = 0;

			ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT);
			if (ifq)
				xa_erase(&ctx->zcrx_ctxs, id);
		}
		if (!ifq)
			break;
		io_zcrx_ifq_free(ifq);
	}

	xa_destroy(&ctx->zcrx_ctxs);
}

static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
{
	unsigned niov_idx;

	lockdep_assert_held(&area->freelist_lock);

	niov_idx = area->freelist[--area->free_count];
	return &area->nia.niovs[niov_idx];
}

static void io_zcrx_return_niov_freelist(struct net_iov *niov)
{
	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);

	spin_lock_bh(&area->freelist_lock);
	area->freelist[area->free_count++] = net_iov_idx(niov);
	spin_unlock_bh(&area->freelist_lock);
}

static void io_zcrx_return_niov(struct net_iov *niov)
{
	netmem_ref netmem = net_iov_to_netmem(niov);

	if (!niov->pp) {
		/* copy fallback allocated niovs */
		io_zcrx_return_niov_freelist(niov);
		return;
	}
	page_pool_put_unrefed_netmem(niov->pp, netmem, -1, false);
}

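/*
 * Reclaim every buffer currently owned by userspace: zero the user refcounts
 * and, once the matching page pool references are dropped, return the niov
 * to the page pool (or to the freelist for copy-fallback buffers).
 */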
static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
{
	struct io_zcrx_area *area = ifq->area;
	int i;

	if (!area)
		return;

	/* Reclaim all buffers handed out to user space. */
	for (i = 0; i < area->nia.num_niovs; i++) {
		struct net_iov *niov = &area->nia.niovs[i];
		int nr;

		if (!atomic_read(io_get_user_counter(niov)))
			continue;
		nr = atomic_xchg(io_get_user_counter(niov), 0);
		if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
			io_zcrx_return_niov(niov);
	}
}

void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
{
	struct io_zcrx_ifq *ifq;
	unsigned long index;

	lockdep_assert_held(&ctx->uring_lock);

	xa_for_each(&ctx->zcrx_ctxs, index, ifq) {
		io_zcrx_scrub(ifq);
		io_close_queue(ifq);
	}
}

static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq)
{
	u32 entries;

	entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head;
	return min(entries, ifq->rq_entries);
}

static struct io_uring_zcrx_rqe *io_zcrx_get_rqe(struct io_zcrx_ifq *ifq,
						 unsigned mask)
{
	unsigned int idx = ifq->cached_rq_head++ & mask;

	return &ifq->rqes[idx];
}

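/*
 * Refill the page pool's alloc cache from the userspace refill ring. Each
 * RQE is validated, its user reference is dropped, and the niov is placed
 * back into the pool cache once it holds no other page pool references.
 */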
static void io_zcrx_ring_refill(struct page_pool *pp,
				struct io_zcrx_ifq *ifq)
{
	unsigned int mask = ifq->rq_entries - 1;
	unsigned int entries;
	netmem_ref netmem;

	spin_lock_bh(&ifq->rq_lock);

	entries = io_zcrx_rqring_entries(ifq);
	entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL - pp->alloc.count);
	if (unlikely(!entries)) {
		spin_unlock_bh(&ifq->rq_lock);
		return;
	}

	do {
		struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask);
		struct io_zcrx_area *area;
		struct net_iov *niov;
		unsigned niov_idx, area_idx;

		area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT;
		niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) >> PAGE_SHIFT;

		if (unlikely(rqe->__pad || area_idx))
			continue;
		area = ifq->area;

		if (unlikely(niov_idx >= area->nia.num_niovs))
			continue;
		niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs);

		niov = &area->nia.niovs[niov_idx];
		if (!io_zcrx_put_niov_uref(niov))
			continue;

		netmem = net_iov_to_netmem(niov);
		if (page_pool_unref_netmem(netmem, 1) != 0)
			continue;

		if (unlikely(niov->pp != pp)) {
			io_zcrx_return_niov(niov);
			continue;
		}

		io_zcrx_sync_for_device(pp, niov);
		net_mp_netmem_place_in_cache(pp, netmem);
	} while (--entries);

	smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head);
	spin_unlock_bh(&ifq->rq_lock);
}

static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq)
{
	struct io_zcrx_area *area = ifq->area;

	spin_lock_bh(&area->freelist_lock);
	while (area->free_count && pp->alloc.count < PP_ALLOC_CACHE_REFILL) {
		struct net_iov *niov = __io_zcrx_get_free_niov(area);
		netmem_ref netmem = net_iov_to_netmem(niov);

		net_mp_niov_set_page_pool(pp, niov);
		io_zcrx_sync_for_device(pp, niov);
		net_mp_netmem_place_in_cache(pp, netmem);
	}
	spin_unlock_bh(&area->freelist_lock);
}

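/*
 * Page pool allocation callback: first try to refill from the userspace
 * refill ring, then fall back to the area freelist. Returns 0 when no
 * buffers are available.
 */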
static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
{
	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);

	/* pp should already be ensuring that */
	if (unlikely(pp->alloc.count))
		goto out_return;

	io_zcrx_ring_refill(pp, ifq);
	if (likely(pp->alloc.count))
		goto out_return;

	io_zcrx_refill_slow(pp, ifq);
	if (!pp->alloc.count)
		return 0;
out_return:
	return pp->alloc.cache[--pp->alloc.count];
}

static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)
{
	struct net_iov *niov;

	if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
		return false;

	niov = netmem_to_net_iov(netmem);
	net_mp_niov_clear_page_pool(niov);
	io_zcrx_return_niov_freelist(niov);
	return false;
}

static int io_pp_zc_init(struct page_pool *pp)
{
	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
	int ret;

	if (WARN_ON_ONCE(!ifq))
		return -EINVAL;
	if (WARN_ON_ONCE(ifq->dev != pp->p.dev))
		return -EINVAL;
	if (WARN_ON_ONCE(!pp->dma_map))
		return -EOPNOTSUPP;
	if (pp->p.order != 0)
		return -EOPNOTSUPP;
	if (pp->p.dma_dir != DMA_FROM_DEVICE)
		return -EOPNOTSUPP;

	ret = io_zcrx_map_area(ifq, ifq->area);
	if (ret)
		return ret;

	percpu_ref_get(&ifq->ctx->refs);
	return 0;
}

static void io_pp_zc_destroy(struct page_pool *pp)
{
	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
	struct io_zcrx_area *area = ifq->area;

	if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs))
		return;
	percpu_ref_put(&ifq->ctx->refs);
}

static int io_pp_nl_fill(void *mp_priv, struct sk_buff *rsp,
			 struct netdev_rx_queue *rxq)
{
	struct nlattr *nest;
	int type;

	type = rxq ? NETDEV_A_QUEUE_IO_URING : NETDEV_A_PAGE_POOL_IO_URING;
	nest = nla_nest_start(rsp, type);
	if (!nest)
		return -EMSGSIZE;
	nla_nest_end(rsp, nest);

	return 0;
}

static void io_pp_uninstall(void *mp_priv, struct netdev_rx_queue *rxq)
{
	struct pp_memory_provider_params *p = &rxq->mp_params;
	struct io_zcrx_ifq *ifq = mp_priv;

	io_zcrx_drop_netdev(ifq);
	if (ifq->area)
		io_zcrx_unmap_area(ifq, ifq->area);

	p->mp_ops = NULL;
	p->mp_priv = NULL;
}

static const struct memory_provider_ops io_uring_pp_zc_ops = {
	.alloc_netmems		= io_pp_zc_alloc_netmems,
	.release_netmem		= io_pp_zc_release_netmem,
	.init			= io_pp_zc_init,
	.destroy		= io_pp_zc_destroy,
	.nl_fill		= io_pp_nl_fill,
	.uninstall		= io_pp_uninstall,
};

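/*
 * Post a zcrx completion: a regular CQE carrying the length plus a trailing
 * struct io_uring_zcrx_cqe that encodes the area token and the offset of the
 * received data within the area.
 */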
static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov,
			      struct io_zcrx_ifq *ifq, int off, int len)
{
	struct io_uring_zcrx_cqe *rcqe;
	struct io_zcrx_area *area;
	struct io_uring_cqe *cqe;
	u64 offset;

	if (!io_defer_get_uncommited_cqe(req->ctx, &cqe))
		return false;

	cqe->user_data = req->cqe.user_data;
	cqe->res = len;
	cqe->flags = IORING_CQE_F_MORE;

	area = io_zcrx_iov_to_area(niov);
	offset = off + (net_iov_idx(niov) << PAGE_SHIFT);
	rcqe = (struct io_uring_zcrx_cqe *)(cqe + 1);
	rcqe->off = offset + ((u64)area->area_id << IORING_ZCRX_AREA_SHIFT);
	rcqe->__pad = 0;
	return true;
}

static struct net_iov *io_zcrx_alloc_fallback(struct io_zcrx_area *area)
{
	struct net_iov *niov = NULL;

	spin_lock_bh(&area->freelist_lock);
	if (area->free_count)
		niov = __io_zcrx_get_free_niov(area);
	spin_unlock_bh(&area->freelist_lock);

	if (niov)
		page_pool_fragment_netmem(net_iov_to_netmem(niov), 1);
	return niov;
}

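/*
 * Copy fallback for data that isn't in zero-copy capable memory (e.g. skb
 * linear data or regular pages): grab free niovs from the area, memcpy the
 * payload into them page by page and post a CQE for each chunk. Returns the
 * number of bytes copied, or an error if nothing was copied.
 */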
static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
				  void *src_base, struct page *src_page,
				  unsigned int src_offset, size_t len)
{
	struct io_zcrx_area *area = ifq->area;
	size_t copied = 0;
	int ret = 0;

	if (area->mem.is_dmabuf)
		return -EFAULT;

	while (len) {
		size_t copy_size = min_t(size_t, PAGE_SIZE, len);
		const int dst_off = 0;
		struct net_iov *niov;
		struct page *dst_page;
		void *dst_addr;

		niov = io_zcrx_alloc_fallback(area);
		if (!niov) {
			ret = -ENOMEM;
			break;
		}

		dst_page = io_zcrx_iov_page(niov);
		dst_addr = kmap_local_page(dst_page);
		if (src_page)
			src_base = kmap_local_page(src_page);

		memcpy(dst_addr, src_base + src_offset, copy_size);

		if (src_page)
			kunmap_local(src_base);
		kunmap_local(dst_addr);

		if (!io_zcrx_queue_cqe(req, niov, ifq, dst_off, copy_size)) {
			io_zcrx_return_niov(niov);
			ret = -ENOSPC;
			break;
		}

		io_zcrx_get_niov_uref(niov);
		src_offset += copy_size;
		len -= copy_size;
		copied += copy_size;
	}

	return copied ? copied : ret;
}

static int io_zcrx_copy_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
			     const skb_frag_t *frag, int off, int len)
{
	struct page *page = skb_frag_page(frag);
	u32 p_off, p_len, t, copied = 0;
	int ret = 0;

	off += skb_frag_off(frag);

	skb_frag_foreach_page(frag, off, len,
			      page, p_off, p_len, t) {
		ret = io_zcrx_copy_chunk(req, ifq, NULL, page, p_off, p_len);
		if (ret < 0)
			return copied ? copied : ret;
		copied += ret;
	}
	return copied;
}

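/*
 * Hand a single skb frag to userspace. Frags backed by this ifq's page pool
 * are delivered zero-copy by posting a CQE and taking page pool and user
 * references; anything else goes through the copy fallback.
 */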
static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
			     const skb_frag_t *frag, int off, int len)
{
	struct net_iov *niov;

	if (unlikely(!skb_frag_is_net_iov(frag)))
		return io_zcrx_copy_frag(req, ifq, frag, off, len);

	niov = netmem_to_net_iov(frag->netmem);
	if (!niov->pp || niov->pp->mp_ops != &io_uring_pp_zc_ops ||
	    io_pp_to_ifq(niov->pp) != ifq)
		return -EFAULT;

	if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len))
		return -ENOSPC;

	/*
	 * Prevent it from being recycled while user is accessing it.
	 * It has to be done before grabbing a user reference.
	 */
	page_pool_ref_netmem(net_iov_to_netmem(niov));
	io_zcrx_get_niov_uref(niov);
	return len;
}

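/*
 * tcp_read_sock() callback: consume an skb by copying its linear part and
 * then delivering each frag (and nested frag_iter skbs) via
 * io_zcrx_recv_frag(). Updates desc->count and caps the work done per call
 * with IO_SKBS_PER_CALL_LIMIT.
 */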
static int
io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
		 unsigned int offset, size_t len)
{
	struct io_zcrx_args *args = desc->arg.data;
	struct io_zcrx_ifq *ifq = args->ifq;
	struct io_kiocb *req = args->req;
	struct sk_buff *frag_iter;
	unsigned start, start_off = offset;
	int i, copy, end, off;
	int ret = 0;

	len = min_t(size_t, len, desc->count);
	/*
	 * __tcp_read_sock() always calls io_zcrx_recv_skb one last time, even
	 * if desc->count is already 0. This is caused by the if (offset + 1 !=
	 * skb->len) check. Return early in this case to break out of
	 * __tcp_read_sock().
	 */
	if (!len)
		return 0;
	if (unlikely(args->nr_skbs++ > IO_SKBS_PER_CALL_LIMIT))
		return -EAGAIN;

	if (unlikely(offset < skb_headlen(skb))) {
		ssize_t copied;
		size_t to_copy;

		to_copy = min_t(size_t, skb_headlen(skb) - offset, len);
		copied = io_zcrx_copy_chunk(req, ifq, skb->data, NULL,
					    offset, to_copy);
		if (copied < 0) {
			ret = copied;
			goto out;
		}
		offset += copied;
		len -= copied;
		if (!len)
			goto out;
		if (offset != skb_headlen(skb))
			goto out;
	}

	start = skb_headlen(skb);

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		const skb_frag_t *frag;

		if (WARN_ON(start > offset + len))
			return -EFAULT;

		frag = &skb_shinfo(skb)->frags[i];
		end = start + skb_frag_size(frag);

		if (offset < end) {
			copy = end - offset;
			if (copy > len)
				copy = len;

			off = offset - start;
			ret = io_zcrx_recv_frag(req, ifq, frag, off, copy);
			if (ret < 0)
				goto out;

			offset += ret;
			len -= ret;
			if (len == 0 || ret != copy)
				goto out;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		if (WARN_ON(start > offset + len))
			return -EFAULT;

		end = start + frag_iter->len;
		if (offset < end) {
			copy = end - offset;
			if (copy > len)
				copy = len;

			off = offset - start;
			ret = io_zcrx_recv_skb(desc, frag_iter, off, copy);
			if (ret < 0)
				goto out;

			offset += ret;
			len -= ret;
			if (len == 0 || ret != copy)
				goto out;
		}
		start = end;
	}

out:
	if (offset == start_off)
		return ret;
	desc->count -= (offset - start_off);
	return offset - start_off;
}

static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
				struct sock *sk, int flags,
				unsigned issue_flags, unsigned int *outlen)
{
	unsigned int len = *outlen;
	struct io_zcrx_args args = {
		.req = req,
		.ifq = ifq,
		.sock = sk->sk_socket,
	};
	read_descriptor_t rd_desc = {
		.count = len ? len : UINT_MAX,
		.arg.data = &args,
	};
	int ret;

	lock_sock(sk);
	ret = tcp_read_sock(sk, &rd_desc, io_zcrx_recv_skb);
	if (len && ret > 0)
		*outlen = len - ret;
	if (ret <= 0) {
		if (ret < 0 || sock_flag(sk, SOCK_DONE))
			goto out;
		if (sk->sk_err)
			ret = sock_error(sk);
		else if (sk->sk_shutdown & RCV_SHUTDOWN)
			goto out;
		else if (sk->sk_state == TCP_CLOSE)
			ret = -ENOTCONN;
		else
			ret = -EAGAIN;
	} else if (unlikely(args.nr_skbs > IO_SKBS_PER_CALL_LIMIT) &&
		   (issue_flags & IO_URING_F_MULTISHOT)) {
		ret = IOU_REQUEUE;
	} else if (sock_flag(sk, SOCK_DONE)) {
		/* Keep retrying until tcp_read_sock() finally returns 0. */
		if (issue_flags & IO_URING_F_MULTISHOT)
			ret = IOU_REQUEUE;
		else
			ret = -EAGAIN;
	}
out:
	release_sock(sk);
	return ret;
}

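/*
 * Entry point for zero-copy receive on a socket. Only plain TCP sockets are
 * supported (the protocol's recvmsg must be tcp_recvmsg).
 */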
int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
		 struct socket *sock, unsigned int flags,
		 unsigned issue_flags, unsigned int *len)
{
	struct sock *sk = sock->sk;
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot->recvmsg != tcp_recvmsg)
		return -EPROTONOSUPPORT;

	sock_rps_record_flow(sk);
	return io_zcrx_tcp_recvmsg(req, ifq, sk, flags, issue_flags, len);
}