xref: /linux/io_uring/zcrx.c (revision 3f1c07fc21c68bd3bd2df9d2c9441f6485e934d9)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/kernel.h>
3 #include <linux/errno.h>
4 #include <linux/dma-map-ops.h>
5 #include <linux/mm.h>
6 #include <linux/nospec.h>
7 #include <linux/io_uring.h>
8 #include <linux/netdevice.h>
9 #include <linux/rtnetlink.h>
10 #include <linux/skbuff_ref.h>
11 #include <linux/anon_inodes.h>
12 
13 #include <net/page_pool/helpers.h>
14 #include <net/page_pool/memory_provider.h>
15 #include <net/netlink.h>
16 #include <net/netdev_queues.h>
17 #include <net/netdev_rx_queue.h>
18 #include <net/tcp.h>
19 #include <net/rps.h>
20 
21 #include <trace/events/page_pool.h>
22 
23 #include <uapi/linux/io_uring.h>
24 
25 #include "io_uring.h"
26 #include "kbuf.h"
27 #include "memmap.h"
28 #include "zcrx.h"
29 #include "rsrc.h"
30 
31 #define IO_ZCRX_AREA_SUPPORTED_FLAGS	(IORING_ZCRX_AREA_DMABUF)
32 
33 #define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)
34 
35 static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp)
36 {
37 	return pp->mp_priv;
38 }
39 
40 static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
41 {
42 	struct net_iov_area *owner = net_iov_owner(niov);
43 
44 	return container_of(owner, struct io_zcrx_area, nia);
45 }
46 
47 static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
48 {
49 	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
50 	unsigned niov_pages_shift;
51 
52 	lockdep_assert(!area->mem.is_dmabuf);
53 
54 	niov_pages_shift = area->ifq->niov_shift - PAGE_SHIFT;
55 	return area->mem.pages[net_iov_idx(niov) << niov_pages_shift];
56 }
57 
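/*
 * Walk the DMA-mapped scatterlist and assign one DMA address per net_iov.
 * Every sg entry must be a multiple of the niov size, and the table must
 * cover exactly area->nia.num_niovs entries.
 */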
58 static int io_populate_area_dma(struct io_zcrx_ifq *ifq,
59 				struct io_zcrx_area *area)
60 {
61 	unsigned niov_size = 1U << ifq->niov_shift;
62 	struct sg_table *sgt = area->mem.sgt;
63 	struct scatterlist *sg;
64 	unsigned i, niov_idx = 0;
65 
66 	for_each_sgtable_dma_sg(sgt, sg, i) {
67 		dma_addr_t dma = sg_dma_address(sg);
68 		unsigned long sg_len = sg_dma_len(sg);
69 
70 		if (WARN_ON_ONCE(sg_len % niov_size))
71 			return -EINVAL;
72 
73 		while (sg_len && niov_idx < area->nia.num_niovs) {
74 			struct net_iov *niov = &area->nia.niovs[niov_idx];
75 
76 			if (net_mp_niov_set_dma_addr(niov, dma))
77 				return -EFAULT;
78 			sg_len -= niov_size;
79 			dma += niov_size;
80 			niov_idx++;
81 		}
82 	}
83 
84 	if (WARN_ON_ONCE(niov_idx != area->nia.num_niovs))
85 		return -EFAULT;
86 	return 0;
87 }
88 
89 static void io_release_dmabuf(struct io_zcrx_mem *mem)
90 {
91 	if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
92 		return;
93 
94 	if (mem->sgt)
95 		dma_buf_unmap_attachment_unlocked(mem->attach, mem->sgt,
96 						  DMA_FROM_DEVICE);
97 	if (mem->attach)
98 		dma_buf_detach(mem->dmabuf, mem->attach);
99 	if (mem->dmabuf)
100 		dma_buf_put(mem->dmabuf);
101 
102 	mem->sgt = NULL;
103 	mem->attach = NULL;
104 	mem->dmabuf = NULL;
105 }
106 
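/*
 * Import a user-supplied dmabuf as the backing memory of an area: take a
 * reference on the fd, attach and map it against the queue's DMA device,
 * and verify that the mapped length matches the registered area length.
 * A non-zero offset is rejected; partial setup is undone on error via
 * io_release_dmabuf().
 */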
107 static int io_import_dmabuf(struct io_zcrx_ifq *ifq,
108 			    struct io_zcrx_mem *mem,
109 			    struct io_uring_zcrx_area_reg *area_reg)
110 {
111 	unsigned long off = (unsigned long)area_reg->addr;
112 	unsigned long len = (unsigned long)area_reg->len;
113 	unsigned long total_size = 0;
114 	struct scatterlist *sg;
115 	int dmabuf_fd = area_reg->dmabuf_fd;
116 	int i, ret;
117 
118 	if (off)
119 		return -EINVAL;
120 	if (WARN_ON_ONCE(!ifq->dev))
121 		return -EFAULT;
122 	if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
123 		return -EINVAL;
124 
125 	mem->is_dmabuf = true;
126 	mem->dmabuf = dma_buf_get(dmabuf_fd);
127 	if (IS_ERR(mem->dmabuf)) {
128 		ret = PTR_ERR(mem->dmabuf);
129 		mem->dmabuf = NULL;
130 		goto err;
131 	}
132 
133 	mem->attach = dma_buf_attach(mem->dmabuf, ifq->dev);
134 	if (IS_ERR(mem->attach)) {
135 		ret = PTR_ERR(mem->attach);
136 		mem->attach = NULL;
137 		goto err;
138 	}
139 
140 	mem->sgt = dma_buf_map_attachment_unlocked(mem->attach, DMA_FROM_DEVICE);
141 	if (IS_ERR(mem->sgt)) {
142 		ret = PTR_ERR(mem->sgt);
143 		mem->sgt = NULL;
144 		goto err;
145 	}
146 
147 	for_each_sgtable_dma_sg(mem->sgt, sg, i)
148 		total_size += sg_dma_len(sg);
149 
150 	if (total_size != len) {
151 		ret = -EINVAL;
152 		goto err;
153 	}
154 
155 	mem->size = len;
156 	return 0;
157 err:
158 	io_release_dmabuf(mem);
159 	return ret;
160 }
161 
162 static unsigned long io_count_account_pages(struct page **pages, unsigned nr_pages)
163 {
164 	struct folio *last_folio = NULL;
165 	unsigned long res = 0;
166 	int i;
167 
168 	for (i = 0; i < nr_pages; i++) {
169 		struct folio *folio = page_folio(pages[i]);
170 
171 		if (folio == last_folio)
172 			continue;
173 		last_folio = folio;
174 		res += folio_nr_pages(folio);
175 	}
176 	return res;
177 }
178 
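/*
 * Import anonymous user memory: pin the user pages, build a scatterlist
 * over them and charge the pinned memory against the user's accounting.
 * The dmabuf_fd field must be unset for this path.
 */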
179 static int io_import_umem(struct io_zcrx_ifq *ifq,
180 			  struct io_zcrx_mem *mem,
181 			  struct io_uring_zcrx_area_reg *area_reg)
182 {
183 	struct page **pages;
184 	int nr_pages, ret;
185 
186 	if (area_reg->dmabuf_fd)
187 		return -EINVAL;
188 	if (!area_reg->addr)
189 		return -EFAULT;
190 	pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
191 				   &nr_pages);
192 	if (IS_ERR(pages))
193 		return PTR_ERR(pages);
194 
195 	ret = sg_alloc_table_from_pages(&mem->page_sg_table, pages, nr_pages,
196 					0, nr_pages << PAGE_SHIFT,
197 					GFP_KERNEL_ACCOUNT);
198 	if (ret) {
199 		unpin_user_pages(pages, nr_pages);
200 		return ret;
201 	}
202 
203 	mem->account_pages = io_count_account_pages(pages, nr_pages);
204 	ret = io_account_mem(ifq->user, ifq->mm_account, mem->account_pages);
205 	if (ret < 0)
206 		mem->account_pages = 0;
207 
208 	mem->sgt = &mem->page_sg_table;
209 	mem->pages = pages;
210 	mem->nr_folios = nr_pages;
211 	mem->size = area_reg->len;
212 	return ret;
213 }
214 
215 static void io_release_area_mem(struct io_zcrx_mem *mem)
216 {
217 	if (mem->is_dmabuf) {
218 		io_release_dmabuf(mem);
219 		return;
220 	}
221 	if (mem->pages) {
222 		unpin_user_pages(mem->pages, mem->nr_folios);
223 		sg_free_table(mem->sgt);
224 		mem->sgt = NULL;
225 		kvfree(mem->pages);
226 	}
227 }
228 
229 static int io_import_area(struct io_zcrx_ifq *ifq,
230 			  struct io_zcrx_mem *mem,
231 			  struct io_uring_zcrx_area_reg *area_reg)
232 {
233 	int ret;
234 
235 	if (area_reg->flags & ~IO_ZCRX_AREA_SUPPORTED_FLAGS)
236 		return -EINVAL;
237 	if (area_reg->rq_area_token)
238 		return -EINVAL;
239 	if (area_reg->__resv2[0] || area_reg->__resv2[1])
240 		return -EINVAL;
241 
242 	ret = io_validate_user_buf_range(area_reg->addr, area_reg->len);
243 	if (ret)
244 		return ret;
245 	if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
246 		return -EINVAL;
247 
248 	if (area_reg->flags & IORING_ZCRX_AREA_DMABUF)
249 		return io_import_dmabuf(ifq, mem, area_reg);
250 	return io_import_umem(ifq, mem, area_reg);
251 }
252 
253 static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
254 				struct io_zcrx_area *area)
255 {
256 	int i;
257 
258 	guard(mutex)(&ifq->pp_lock);
259 	if (!area->is_mapped)
260 		return;
261 	area->is_mapped = false;
262 
263 	for (i = 0; i < area->nia.num_niovs; i++)
264 		net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0);
265 
266 	if (area->mem.is_dmabuf) {
267 		io_release_dmabuf(&area->mem);
268 	} else {
269 		dma_unmap_sgtable(ifq->dev, &area->mem.page_sg_table,
270 				  DMA_FROM_DEVICE, IO_DMA_ATTR);
271 	}
272 }
273 
274 static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
275 {
276 	int ret;
277 
278 	guard(mutex)(&ifq->pp_lock);
279 	if (area->is_mapped)
280 		return 0;
281 
282 	if (!area->mem.is_dmabuf) {
283 		ret = dma_map_sgtable(ifq->dev, &area->mem.page_sg_table,
284 				      DMA_FROM_DEVICE, IO_DMA_ATTR);
285 		if (ret < 0)
286 			return ret;
287 	}
288 
289 	ret = io_populate_area_dma(ifq, area);
290 	if (ret == 0)
291 		area->is_mapped = true;
292 	return ret;
293 }
294 
295 static void io_zcrx_sync_for_device(struct page_pool *pool,
296 				    struct net_iov *niov)
297 {
298 #if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
299 	dma_addr_t dma_addr;
300 
301 	unsigned niov_size;
302 
303 	if (!dma_dev_need_sync(pool->p.dev))
304 		return;
305 
306 	niov_size = 1U << io_pp_to_ifq(pool)->niov_shift;
307 	dma_addr = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov));
308 	__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
309 				     niov_size, pool->p.dma_dir);
310 #endif
311 }
312 
313 #define IO_RQ_MAX_ENTRIES		32768
314 
315 #define IO_SKBS_PER_CALL_LIMIT	20
316 
317 struct io_zcrx_args {
318 	struct io_kiocb		*req;
319 	struct io_zcrx_ifq	*ifq;
320 	struct socket		*sock;
321 	unsigned		nr_skbs;
322 };
323 
324 static const struct memory_provider_ops io_uring_pp_zc_ops;
325 
326 static inline atomic_t *io_get_user_counter(struct net_iov *niov)
327 {
328 	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
329 
330 	return &area->user_refs[net_iov_idx(niov)];
331 }
332 
333 static bool io_zcrx_put_niov_uref(struct net_iov *niov)
334 {
335 	atomic_t *uref = io_get_user_counter(niov);
336 
337 	if (unlikely(!atomic_read(uref)))
338 		return false;
339 	atomic_dec(uref);
340 	return true;
341 }
342 
343 static void io_zcrx_get_niov_uref(struct net_iov *niov)
344 {
345 	atomic_inc(io_get_user_counter(niov));
346 }
347 
348 static void io_fill_zcrx_offsets(struct io_uring_zcrx_offsets *offsets)
349 {
350 	offsets->head = offsetof(struct io_uring, head);
351 	offsets->tail = offsetof(struct io_uring, tail);
352 	offsets->rqes = ALIGN(sizeof(struct io_uring), L1_CACHE_BYTES);
353 }
354 
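/*
 * Set up the refill ring inside a user-described memory region: check that
 * the region fits the ring header plus rq_entries RQEs, create the
 * mmap'able region at an offset derived from the zcrx id, and record
 * pointers to the ring header and the RQE array.
 */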
355 static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx,
356 				 struct io_zcrx_ifq *ifq,
357 				 struct io_uring_zcrx_ifq_reg *reg,
358 				 struct io_uring_region_desc *rd,
359 				 u32 id)
360 {
361 	u64 mmap_offset;
362 	size_t off, size;
363 	void *ptr;
364 	int ret;
365 
366 	io_fill_zcrx_offsets(&reg->offsets);
367 	off = reg->offsets.rqes;
368 	size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries;
369 	if (size > rd->size)
370 		return -EINVAL;
371 
372 	mmap_offset = IORING_MAP_OFF_ZCRX_REGION;
373 	mmap_offset += id << IORING_OFF_PBUF_SHIFT;
374 
375 	ret = io_create_region(ctx, &ifq->region, rd, mmap_offset);
376 	if (ret < 0)
377 		return ret;
378 
379 	ptr = io_region_get_ptr(&ifq->region);
380 	ifq->rq_ring = (struct io_uring *)ptr;
381 	ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
382 
383 	return 0;
384 }
385 
386 static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
387 {
388 	io_free_region(ifq->user, &ifq->region);
389 	ifq->rq_ring = NULL;
390 	ifq->rqes = NULL;
391 }
392 
393 static void io_zcrx_free_area(struct io_zcrx_ifq *ifq,
394 			      struct io_zcrx_area *area)
395 {
396 	io_zcrx_unmap_area(ifq, area);
397 	io_release_area_mem(&area->mem);
398 
399 	if (area->mem.account_pages)
400 		io_unaccount_mem(ifq->user, ifq->mm_account,
401 				 area->mem.account_pages);
402 
403 	kvfree(area->freelist);
404 	kvfree(area->nia.niovs);
405 	kvfree(area->user_refs);
406 	kfree(area);
407 }
408 
409 static int io_zcrx_append_area(struct io_zcrx_ifq *ifq,
410 				struct io_zcrx_area *area)
411 {
412 	if (ifq->area)
413 		return -EINVAL;
414 	ifq->area = area;
415 	return 0;
416 }
417 
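/*
 * Create the single buffer area backing this interface queue: import the
 * user memory (pages or dmabuf), carve it into PAGE_SIZE-sized niovs, and
 * allocate the niov array, the freelist and the per-niov user refcounts.
 */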
418 static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
419 			       struct io_uring_zcrx_area_reg *area_reg)
420 {
421 	struct io_zcrx_area *area;
422 	unsigned nr_iovs;
423 	int i, ret;
424 
425 	ret = -ENOMEM;
426 	area = kzalloc(sizeof(*area), GFP_KERNEL);
427 	if (!area)
428 		goto err;
429 	area->ifq = ifq;
430 
431 	ret = io_import_area(ifq, &area->mem, area_reg);
432 	if (ret)
433 		goto err;
434 
435 	ifq->niov_shift = PAGE_SHIFT;
436 	nr_iovs = area->mem.size >> ifq->niov_shift;
437 	area->nia.num_niovs = nr_iovs;
438 
439 	ret = -ENOMEM;
440 	area->nia.niovs = kvmalloc_array(nr_iovs, sizeof(area->nia.niovs[0]),
441 					 GFP_KERNEL_ACCOUNT | __GFP_ZERO);
442 	if (!area->nia.niovs)
443 		goto err;
444 
445 	area->freelist = kvmalloc_array(nr_iovs, sizeof(area->freelist[0]),
446 					GFP_KERNEL_ACCOUNT | __GFP_ZERO);
447 	if (!area->freelist)
448 		goto err;
449 
450 	area->user_refs = kvmalloc_array(nr_iovs, sizeof(area->user_refs[0]),
451 					GFP_KERNEL_ACCOUNT | __GFP_ZERO);
452 	if (!area->user_refs)
453 		goto err;
454 
455 	for (i = 0; i < nr_iovs; i++) {
456 		struct net_iov *niov = &area->nia.niovs[i];
457 
458 		niov->owner = &area->nia;
459 		area->freelist[i] = i;
460 		atomic_set(&area->user_refs[i], 0);
461 		niov->type = NET_IOV_IOURING;
462 	}
463 
464 	area->free_count = nr_iovs;
465 	/* we're only supporting one area per ifq for now */
466 	area->area_id = 0;
467 	area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT;
468 	spin_lock_init(&area->freelist_lock);
469 
470 	ret = io_zcrx_append_area(ifq, area);
471 	if (!ret)
472 		return 0;
473 err:
474 	if (area)
475 		io_zcrx_free_area(ifq, area);
476 	return ret;
477 }
478 
479 static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
480 {
481 	struct io_zcrx_ifq *ifq;
482 
483 	ifq = kzalloc(sizeof(*ifq), GFP_KERNEL);
484 	if (!ifq)
485 		return NULL;
486 
487 	ifq->if_rxq = -1;
488 	spin_lock_init(&ifq->rq_lock);
489 	mutex_init(&ifq->pp_lock);
490 	refcount_set(&ifq->refs, 1);
491 	refcount_set(&ifq->user_refs, 1);
492 	return ifq;
493 }
494 
495 static void io_zcrx_drop_netdev(struct io_zcrx_ifq *ifq)
496 {
497 	guard(mutex)(&ifq->pp_lock);
498 
499 	if (!ifq->netdev)
500 		return;
501 	netdev_put(ifq->netdev, &ifq->netdev_tracker);
502 	ifq->netdev = NULL;
503 }
504 
505 static void io_close_queue(struct io_zcrx_ifq *ifq)
506 {
507 	struct net_device *netdev;
508 	netdevice_tracker netdev_tracker;
509 	struct pp_memory_provider_params p = {
510 		.mp_ops = &io_uring_pp_zc_ops,
511 		.mp_priv = ifq,
512 	};
513 
514 	if (ifq->if_rxq == -1)
515 		return;
516 
517 	scoped_guard(mutex, &ifq->pp_lock) {
518 		netdev = ifq->netdev;
519 		netdev_tracker = ifq->netdev_tracker;
520 		ifq->netdev = NULL;
521 	}
522 
523 	if (netdev) {
524 		net_mp_close_rxq(netdev, ifq->if_rxq, &p);
525 		netdev_put(netdev, &netdev_tracker);
526 	}
527 	ifq->if_rxq = -1;
528 }
529 
530 static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
531 {
532 	io_close_queue(ifq);
533 
534 	if (ifq->area)
535 		io_zcrx_free_area(ifq, ifq->area);
536 	free_uid(ifq->user);
537 	if (ifq->mm_account)
538 		mmdrop(ifq->mm_account);
539 	if (ifq->dev)
540 		put_device(ifq->dev);
541 
542 	io_free_rbuf_ring(ifq);
543 	mutex_destroy(&ifq->pp_lock);
544 	kfree(ifq);
545 }
546 
547 static void io_put_zcrx_ifq(struct io_zcrx_ifq *ifq)
548 {
549 	if (refcount_dec_and_test(&ifq->refs))
550 		io_zcrx_ifq_free(ifq);
551 }
552 
553 static void io_zcrx_return_niov_freelist(struct net_iov *niov)
554 {
555 	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
556 
557 	spin_lock_bh(&area->freelist_lock);
558 	area->freelist[area->free_count++] = net_iov_idx(niov);
559 	spin_unlock_bh(&area->freelist_lock);
560 }
561 
562 static void io_zcrx_return_niov(struct net_iov *niov)
563 {
564 	netmem_ref netmem = net_iov_to_netmem(niov);
565 
566 	if (!niov->desc.pp) {
567 		/* copy fallback allocated niovs */
568 		io_zcrx_return_niov_freelist(niov);
569 		return;
570 	}
571 	page_pool_put_unrefed_netmem(niov->desc.pp, netmem, -1, false);
572 }
573 
574 static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
575 {
576 	struct io_zcrx_area *area = ifq->area;
577 	int i;
578 
579 	if (!area)
580 		return;
581 
582 	/* Reclaim all buffers currently handed out to user space. */
583 	for (i = 0; i < area->nia.num_niovs; i++) {
584 		struct net_iov *niov = &area->nia.niovs[i];
585 		int nr;
586 
587 		if (!atomic_read(io_get_user_counter(niov)))
588 			continue;
589 		nr = atomic_xchg(io_get_user_counter(niov), 0);
590 		if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
591 			io_zcrx_return_niov(niov);
592 	}
593 }
594 
595 static void zcrx_unregister(struct io_zcrx_ifq *ifq)
596 {
597 	if (refcount_dec_and_test(&ifq->user_refs)) {
598 		io_close_queue(ifq);
599 		io_zcrx_scrub(ifq);
600 	}
601 	io_put_zcrx_ifq(ifq);
602 }
603 
604 struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
605 					    unsigned int id)
606 {
607 	struct io_zcrx_ifq *ifq = xa_load(&ctx->zcrx_ctxs, id);
608 
609 	lockdep_assert_held(&ctx->mmap_lock);
610 
611 	return ifq ? &ifq->region : NULL;
612 }
613 
614 static int zcrx_box_release(struct inode *inode, struct file *file)
615 {
616 	struct io_zcrx_ifq *ifq = file->private_data;
617 
618 	if (WARN_ON_ONCE(!ifq))
619 		return -EFAULT;
620 	zcrx_unregister(ifq);
621 	return 0;
622 }
623 
624 static const struct file_operations zcrx_box_fops = {
625 	.owner		= THIS_MODULE,
626 	.release	= zcrx_box_release,
627 };
628 
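/*
 * Export this interface queue through an anonymous [zcrx] file so that
 * another ring can import it via ZCRX_REG_IMPORT. The new fd number is
 * copied back to userspace before the file is installed, and both the
 * object and user refcounts are bumped for the extra reference.
 */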
629 static int zcrx_export(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq,
630 		       struct zcrx_ctrl *ctrl, void __user *arg)
631 {
632 	struct zcrx_ctrl_export *ce = &ctrl->zc_export;
633 	struct file *file;
634 	int fd = -1;
635 
636 	if (!mem_is_zero(ce, sizeof(*ce)))
637 		return -EINVAL;
638 	fd = get_unused_fd_flags(O_CLOEXEC);
639 	if (fd < 0)
640 		return fd;
641 
642 	ce->zcrx_fd = fd;
643 	if (copy_to_user(arg, ctrl, sizeof(*ctrl))) {
644 		put_unused_fd(fd);
645 		return -EFAULT;
646 	}
647 
648 	refcount_inc(&ifq->refs);
649 	refcount_inc(&ifq->user_refs);
650 
651 	file = anon_inode_create_getfile("[zcrx]", &zcrx_box_fops,
652 					 ifq, O_CLOEXEC, NULL);
653 	if (IS_ERR(file)) {
654 		put_unused_fd(fd);
655 		zcrx_unregister(ifq);
656 		return PTR_ERR(file);
657 	}
658 
659 	fd_install(fd, file);
660 	return 0;
661 }
662 
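/*
 * Import a previously exported zcrx instance: look up the [zcrx] file by
 * the fd passed in reg->if_idx, take object and user references, and
 * publish the shared ifq in this ring's zcrx_ctxs table under a fresh id.
 */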
663 static int import_zcrx(struct io_ring_ctx *ctx,
664 		       struct io_uring_zcrx_ifq_reg __user *arg,
665 		       struct io_uring_zcrx_ifq_reg *reg)
666 {
667 	struct io_zcrx_ifq *ifq;
668 	struct file *file;
669 	int fd, ret;
670 	u32 id;
671 
672 	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
673 		return -EINVAL;
674 	if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)))
675 		return -EINVAL;
676 	if (reg->if_rxq || reg->rq_entries || reg->area_ptr || reg->region_ptr)
677 		return -EINVAL;
678 
679 	fd = reg->if_idx;
680 	CLASS(fd, f)(fd);
681 	if (fd_empty(f))
682 		return -EBADF;
683 
684 	file = fd_file(f);
685 	if (file->f_op != &zcrx_box_fops || !file->private_data)
686 		return -EBADF;
687 
688 	ifq = file->private_data;
689 	refcount_inc(&ifq->refs);
690 	refcount_inc(&ifq->user_refs);
691 
692 	scoped_guard(mutex, &ctx->mmap_lock) {
693 		ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL);
694 		if (ret)
695 			goto err;
696 	}
697 
698 	reg->zcrx_id = id;
699 	io_fill_zcrx_offsets(&reg->offsets);
700 	if (copy_to_user(arg, reg, sizeof(*reg))) {
701 		ret = -EFAULT;
702 		goto err_xa_erase;
703 	}
704 
705 	scoped_guard(mutex, &ctx->mmap_lock) {
706 		ret = -ENOMEM;
707 		if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL))
708 			goto err_xa_erase;
709 	}
710 
711 	return 0;
712 err_xa_erase:
713 	scoped_guard(mutex, &ctx->mmap_lock)
714 		xa_erase(&ctx->zcrx_ctxs, id);
715 err:
716 	zcrx_unregister(ifq);
717 	return ret;
718 }
719 
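/*
 * Register a zero-copy receive interface queue: validate the registration
 * structs, allocate the ifq and its refill ring region, pin the target
 * netdev and its queue's DMA device, create the buffer area, bind the
 * io_uring memory provider to the rx queue, and finally publish the ifq
 * id back to userspace. ZCRX_REG_IMPORT instead attaches an ifq exported
 * by another ring.
 */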
720 int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
721 			  struct io_uring_zcrx_ifq_reg __user *arg)
722 {
723 	struct pp_memory_provider_params mp_param = {};
724 	struct io_uring_zcrx_area_reg area;
725 	struct io_uring_zcrx_ifq_reg reg;
726 	struct io_uring_region_desc rd;
727 	struct io_zcrx_ifq *ifq;
728 	int ret;
729 	u32 id;
730 
731 	/*
732 	 * CAP_NET_ADMIN is required: registration 1) allocates an interface
733 	 * queue and 2) can observe data destined for sockets of other tasks.
734 	 */
735 	if (!capable(CAP_NET_ADMIN))
736 		return -EPERM;
737 
738 	/* mandatory io_uring features for zc rx */
739 	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
740 		return -EINVAL;
741 	if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)))
742 		return -EINVAL;
743 	if (copy_from_user(&reg, arg, sizeof(reg)))
744 		return -EFAULT;
745 	if (!mem_is_zero(&reg.__resv, sizeof(reg.__resv)) ||
746 	    reg.__resv2 || reg.zcrx_id)
747 		return -EINVAL;
748 	if (reg.flags & ZCRX_REG_IMPORT)
749 		return import_zcrx(ctx, arg, &reg);
750 	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
751 		return -EFAULT;
752 	if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
753 		return -EINVAL;
754 	if (reg.rq_entries > IO_RQ_MAX_ENTRIES) {
755 		if (!(ctx->flags & IORING_SETUP_CLAMP))
756 			return -EINVAL;
757 		reg.rq_entries = IO_RQ_MAX_ENTRIES;
758 	}
759 	reg.rq_entries = roundup_pow_of_two(reg.rq_entries);
760 
761 	if (copy_from_user(&area, u64_to_user_ptr(reg.area_ptr), sizeof(area)))
762 		return -EFAULT;
763 
764 	ifq = io_zcrx_ifq_alloc(ctx);
765 	if (!ifq)
766 		return -ENOMEM;
767 
768 	if (ctx->user) {
769 		get_uid(ctx->user);
770 		ifq->user = ctx->user;
771 	}
772 	if (ctx->mm_account) {
773 		mmgrab(ctx->mm_account);
774 		ifq->mm_account = ctx->mm_account;
775 	}
776 	ifq->rq_entries = reg.rq_entries;
777 
778 	scoped_guard(mutex, &ctx->mmap_lock) {
779 		/* preallocate id */
780 		ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL);
781 		if (ret)
782 			goto ifq_free;
783 	}
784 
785 	ret = io_allocate_rbuf_ring(ctx, ifq, &reg, &rd, id);
786 	if (ret)
787 		goto err;
788 
789 	ifq->netdev = netdev_get_by_index_lock(current->nsproxy->net_ns, reg.if_idx);
790 	if (!ifq->netdev) {
791 		ret = -ENODEV;
792 		goto err;
793 	}
794 	netdev_hold(ifq->netdev, &ifq->netdev_tracker, GFP_KERNEL);
795 
796 	ifq->dev = netdev_queue_get_dma_dev(ifq->netdev, reg.if_rxq);
797 	if (!ifq->dev) {
798 		ret = -EOPNOTSUPP;
799 		goto netdev_put_unlock;
800 	}
801 	get_device(ifq->dev);
802 
803 	ret = io_zcrx_create_area(ifq, &area);
804 	if (ret)
805 		goto netdev_put_unlock;
806 
807 	mp_param.mp_ops = &io_uring_pp_zc_ops;
808 	mp_param.mp_priv = ifq;
809 	ret = __net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param, NULL);
810 	if (ret)
811 		goto netdev_put_unlock;
812 	netdev_unlock(ifq->netdev);
813 	ifq->if_rxq = reg.if_rxq;
814 
815 	reg.zcrx_id = id;
816 
817 	scoped_guard(mutex, &ctx->mmap_lock) {
818 		/* publish ifq */
819 		ret = -ENOMEM;
820 		if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL))
821 			goto err;
822 	}
823 
824 	if (copy_to_user(arg, &reg, sizeof(reg)) ||
825 	    copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) ||
826 	    copy_to_user(u64_to_user_ptr(reg.area_ptr), &area, sizeof(area))) {
827 		ret = -EFAULT;
828 		goto err;
829 	}
830 	return 0;
831 netdev_put_unlock:
832 	netdev_put(ifq->netdev, &ifq->netdev_tracker);
833 	netdev_unlock(ifq->netdev);
834 err:
835 	scoped_guard(mutex, &ctx->mmap_lock)
836 		xa_erase(&ctx->zcrx_ctxs, id);
837 ifq_free:
838 	io_zcrx_ifq_free(ifq);
839 	return ret;
840 }
841 
842 static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
843 {
844 	unsigned niov_idx;
845 
846 	lockdep_assert_held(&area->freelist_lock);
847 
848 	niov_idx = area->freelist[--area->free_count];
849 	return &area->nia.niovs[niov_idx];
850 }
851 
852 void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
853 {
854 	struct io_zcrx_ifq *ifq;
855 
856 	lockdep_assert_held(&ctx->uring_lock);
857 
858 	while (1) {
859 		scoped_guard(mutex, &ctx->mmap_lock) {
860 			unsigned long id = 0;
861 
862 			ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT);
863 			if (ifq)
864 				xa_erase(&ctx->zcrx_ctxs, id);
865 		}
866 		if (!ifq)
867 			break;
868 		zcrx_unregister(ifq);
869 	}
870 
871 	xa_destroy(&ctx->zcrx_ctxs);
872 }
873 
874 static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq)
875 {
876 	u32 entries;
877 
878 	entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head;
879 	return min(entries, ifq->rq_entries);
880 }
881 
882 static struct io_uring_zcrx_rqe *io_zcrx_get_rqe(struct io_zcrx_ifq *ifq,
883 						 unsigned mask)
884 {
885 	unsigned int idx = ifq->cached_rq_head++ & mask;
886 
887 	return &ifq->rqes[idx];
888 }
889 
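/*
 * Decode a refill ring entry: the upper bits of ->off select the area
 * (only area 0 is accepted for now) and the remaining bits, shifted down
 * by niov_shift, index a niov within it. Out-of-range indices are
 * rejected.
 */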
890 static inline bool io_parse_rqe(struct io_uring_zcrx_rqe *rqe,
891 				struct io_zcrx_ifq *ifq,
892 				struct net_iov **ret_niov)
893 {
894 	unsigned niov_idx, area_idx;
895 	struct io_zcrx_area *area;
896 
897 	area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT;
898 	niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) >> ifq->niov_shift;
899 
900 	if (unlikely(rqe->__pad || area_idx))
901 		return false;
902 	area = ifq->area;
903 
904 	if (unlikely(niov_idx >= area->nia.num_niovs))
905 		return false;
906 	niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs);
907 
908 	*ret_niov = &area->nia.niovs[niov_idx];
909 	return true;
910 }
911 
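/*
 * Fast-path refill: consume entries the user posted to the refill ring,
 * drop the user reference on each niov and, once the page pool reference
 * falls to zero, hand the buffer back to this pool's allocation cache
 * (or return it to its owning pool if it came from a different one).
 */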
912 static void io_zcrx_ring_refill(struct page_pool *pp,
913 				struct io_zcrx_ifq *ifq)
914 {
915 	unsigned int mask = ifq->rq_entries - 1;
916 	unsigned int entries;
917 
918 	guard(spinlock_bh)(&ifq->rq_lock);
919 
920 	entries = io_zcrx_rqring_entries(ifq);
921 	entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL);
922 	if (unlikely(!entries))
923 		return;
924 
925 	do {
926 		struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask);
927 		struct net_iov *niov;
928 		netmem_ref netmem;
929 
930 		if (!io_parse_rqe(rqe, ifq, &niov))
931 			continue;
932 		if (!io_zcrx_put_niov_uref(niov))
933 			continue;
934 
935 		netmem = net_iov_to_netmem(niov);
936 		if (!page_pool_unref_and_test(netmem))
937 			continue;
938 
939 		if (unlikely(niov->desc.pp != pp)) {
940 			io_zcrx_return_niov(niov);
941 			continue;
942 		}
943 
944 		io_zcrx_sync_for_device(pp, niov);
945 		net_mp_netmem_place_in_cache(pp, netmem);
946 	} while (--entries);
947 
948 	smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head);
949 }
950 
951 static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq)
952 {
953 	struct io_zcrx_area *area = ifq->area;
954 
955 	spin_lock_bh(&area->freelist_lock);
956 	while (area->free_count && pp->alloc.count < PP_ALLOC_CACHE_REFILL) {
957 		struct net_iov *niov = __io_zcrx_get_free_niov(area);
958 		netmem_ref netmem = net_iov_to_netmem(niov);
959 
960 		net_mp_niov_set_page_pool(pp, niov);
961 		io_zcrx_sync_for_device(pp, niov);
962 		net_mp_netmem_place_in_cache(pp, netmem);
963 	}
964 	spin_unlock_bh(&area->freelist_lock);
965 }
966 
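/*
 * Memory provider allocation hook: serve buffers from the page pool's
 * allocation cache, refilling it first from the user refill ring and then,
 * if that was not enough, from the area freelist.
 */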
967 static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
968 {
969 	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
970 
971 	/* pp should already be ensuring that */
972 	if (unlikely(pp->alloc.count))
973 		goto out_return;
974 
975 	io_zcrx_ring_refill(pp, ifq);
976 	if (likely(pp->alloc.count))
977 		goto out_return;
978 
979 	io_zcrx_refill_slow(pp, ifq);
980 	if (!pp->alloc.count)
981 		return 0;
982 out_return:
983 	return pp->alloc.cache[--pp->alloc.count];
984 }
985 
986 static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)
987 {
988 	struct net_iov *niov;
989 
990 	if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
991 		return false;
992 
993 	niov = netmem_to_net_iov(netmem);
994 	net_mp_niov_clear_page_pool(niov);
995 	io_zcrx_return_niov_freelist(niov);
996 	return false;
997 }
998 
999 static int io_pp_zc_init(struct page_pool *pp)
1000 {
1001 	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
1002 	int ret;
1003 
1004 	if (WARN_ON_ONCE(!ifq))
1005 		return -EINVAL;
1006 	if (WARN_ON_ONCE(ifq->dev != pp->p.dev))
1007 		return -EINVAL;
1008 	if (WARN_ON_ONCE(!pp->dma_map))
1009 		return -EOPNOTSUPP;
1010 	if (pp->p.order + PAGE_SHIFT != ifq->niov_shift)
1011 		return -EINVAL;
1012 	if (pp->p.dma_dir != DMA_FROM_DEVICE)
1013 		return -EOPNOTSUPP;
1014 
1015 	ret = io_zcrx_map_area(ifq, ifq->area);
1016 	if (ret)
1017 		return ret;
1018 
1019 	refcount_inc(&ifq->refs);
1020 	return 0;
1021 }
1022 
1023 static void io_pp_zc_destroy(struct page_pool *pp)
1024 {
1025 	io_put_zcrx_ifq(io_pp_to_ifq(pp));
1026 }
1027 
1028 static int io_pp_nl_fill(void *mp_priv, struct sk_buff *rsp,
1029 			 struct netdev_rx_queue *rxq)
1030 {
1031 	struct nlattr *nest;
1032 	int type;
1033 
1034 	type = rxq ? NETDEV_A_QUEUE_IO_URING : NETDEV_A_PAGE_POOL_IO_URING;
1035 	nest = nla_nest_start(rsp, type);
1036 	if (!nest)
1037 		return -EMSGSIZE;
1038 	nla_nest_end(rsp, nest);
1039 
1040 	return 0;
1041 }
1042 
1043 static void io_pp_uninstall(void *mp_priv, struct netdev_rx_queue *rxq)
1044 {
1045 	struct pp_memory_provider_params *p = &rxq->mp_params;
1046 	struct io_zcrx_ifq *ifq = mp_priv;
1047 
1048 	io_zcrx_drop_netdev(ifq);
1049 	if (ifq->area)
1050 		io_zcrx_unmap_area(ifq, ifq->area);
1051 
1052 	p->mp_ops = NULL;
1053 	p->mp_priv = NULL;
1054 }
1055 
1056 static const struct memory_provider_ops io_uring_pp_zc_ops = {
1057 	.alloc_netmems		= io_pp_zc_alloc_netmems,
1058 	.release_netmem		= io_pp_zc_release_netmem,
1059 	.init			= io_pp_zc_init,
1060 	.destroy		= io_pp_zc_destroy,
1061 	.nl_fill		= io_pp_nl_fill,
1062 	.uninstall		= io_pp_uninstall,
1063 };
1064 
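/*
 * Pop up to @nr refill ring entries into @netmem_array without dropping
 * any references; the RQ flush path below releases them via
 * zcrx_return_buffers().
 */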
1065 static unsigned zcrx_parse_rq(netmem_ref *netmem_array, unsigned nr,
1066 			      struct io_zcrx_ifq *zcrx)
1067 {
1068 	unsigned int mask = zcrx->rq_entries - 1;
1069 	unsigned int i;
1070 
1071 	guard(spinlock_bh)(&zcrx->rq_lock);
1072 
1073 	nr = min(nr, io_zcrx_rqring_entries(zcrx));
1074 	for (i = 0; i < nr; i++) {
1075 		struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(zcrx, mask);
1076 		struct net_iov *niov;
1077 
1078 		if (!io_parse_rqe(rqe, zcrx, &niov))
1079 			break;
1080 		netmem_array[i] = net_iov_to_netmem(niov);
1081 	}
1082 
1083 	smp_store_release(&zcrx->rq_ring->head, zcrx->cached_rq_head);
1084 	return i;
1085 }
1086 
1087 #define ZCRX_FLUSH_BATCH 32
1088 
1089 static void zcrx_return_buffers(netmem_ref *netmems, unsigned nr)
1090 {
1091 	unsigned i;
1092 
1093 	for (i = 0; i < nr; i++) {
1094 		netmem_ref netmem = netmems[i];
1095 		struct net_iov *niov = netmem_to_net_iov(netmem);
1096 
1097 		if (!io_zcrx_put_niov_uref(niov))
1098 			continue;
1099 		if (!page_pool_unref_and_test(netmem))
1100 			continue;
1101 		io_zcrx_return_niov(niov);
1102 	}
1103 }
1104 
1105 static int zcrx_flush_rq(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx,
1106 			 struct zcrx_ctrl *ctrl)
1107 {
1108 	struct zcrx_ctrl_flush_rq *frq = &ctrl->zc_flush;
1109 	netmem_ref netmems[ZCRX_FLUSH_BATCH];
1110 	unsigned total = 0;
1111 	unsigned nr;
1112 
1113 	if (!mem_is_zero(&frq->__resv, sizeof(frq->__resv)))
1114 		return -EINVAL;
1115 
1116 	do {
1117 		nr = zcrx_parse_rq(netmems, ZCRX_FLUSH_BATCH, zcrx);
1118 
1119 		zcrx_return_buffers(netmems, nr);
1120 		total += nr;
1121 
1122 		if (fatal_signal_pending(current))
1123 			break;
1124 		cond_resched();
1125 	} while (nr == ZCRX_FLUSH_BATCH && total < zcrx->rq_entries);
1126 
1127 	return 0;
1128 }
1129 
1130 int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
1131 {
1132 	struct zcrx_ctrl ctrl;
1133 	struct io_zcrx_ifq *zcrx;
1134 
1135 	if (nr_args)
1136 		return -EINVAL;
1137 	if (copy_from_user(&ctrl, arg, sizeof(ctrl)))
1138 		return -EFAULT;
1139 	if (!mem_is_zero(&ctrl.__resv, sizeof(ctrl.__resv)))
1140 		return -EFAULT;
1141 
1142 	zcrx = xa_load(&ctx->zcrx_ctxs, ctrl.zcrx_id);
1143 	if (!zcrx)
1144 		return -ENXIO;
1145 
1146 	switch (ctrl.op) {
1147 	case ZCRX_CTRL_FLUSH_RQ:
1148 		return zcrx_flush_rq(ctx, zcrx, &ctrl);
1149 	case ZCRX_CTRL_EXPORT:
1150 		return zcrx_export(ctx, zcrx, &ctrl, arg);
1151 	}
1152 
1153 	return -EOPNOTSUPP;
1154 }
1155 
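/*
 * Post a zcrx completion: a CQE carrying the byte count plus a trailing
 * io_uring_zcrx_cqe whose ->off packs the area id in the bits above
 * IORING_ZCRX_AREA_SHIFT and the byte offset of the data within the area
 * in the bits below it.
 */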
1156 static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov,
1157 			      struct io_zcrx_ifq *ifq, int off, int len)
1158 {
1159 	struct io_ring_ctx *ctx = req->ctx;
1160 	struct io_uring_zcrx_cqe *rcqe;
1161 	struct io_zcrx_area *area;
1162 	struct io_uring_cqe *cqe;
1163 	u64 offset;
1164 
1165 	if (!io_defer_get_uncommited_cqe(ctx, &cqe))
1166 		return false;
1167 
1168 	cqe->user_data = req->cqe.user_data;
1169 	cqe->res = len;
1170 	cqe->flags = IORING_CQE_F_MORE;
1171 	if (ctx->flags & IORING_SETUP_CQE_MIXED)
1172 		cqe->flags |= IORING_CQE_F_32;
1173 
1174 	area = io_zcrx_iov_to_area(niov);
1175 	offset = off + (net_iov_idx(niov) << ifq->niov_shift);
1176 	rcqe = (struct io_uring_zcrx_cqe *)(cqe + 1);
1177 	rcqe->off = offset + ((u64)area->area_id << IORING_ZCRX_AREA_SHIFT);
1178 	rcqe->__pad = 0;
1179 	return true;
1180 }
1181 
1182 static struct net_iov *io_alloc_fallback_niov(struct io_zcrx_ifq *ifq)
1183 {
1184 	struct io_zcrx_area *area = ifq->area;
1185 	struct net_iov *niov = NULL;
1186 
1187 	if (area->mem.is_dmabuf)
1188 		return NULL;
1189 
1190 	spin_lock_bh(&area->freelist_lock);
1191 	if (area->free_count)
1192 		niov = __io_zcrx_get_free_niov(area);
1193 	spin_unlock_bh(&area->freelist_lock);
1194 
1195 	if (niov)
1196 		page_pool_fragment_netmem(net_iov_to_netmem(niov), 1);
1197 	return niov;
1198 }
1199 
1200 struct io_copy_cache {
1201 	struct page		*page;
1202 	unsigned long		offset;
1203 	size_t			size;
1204 };
1205 
1206 static ssize_t io_copy_page(struct io_copy_cache *cc, struct page *src_page,
1207 			    unsigned int src_offset, size_t len)
1208 {
1209 	size_t copied = 0;
1210 
1211 	len = min(len, cc->size);
1212 
1213 	while (len) {
1214 		void *src_addr, *dst_addr;
1215 		struct page *dst_page = cc->page;
1216 		unsigned dst_offset = cc->offset;
1217 		size_t n = len;
1218 
1219 		if (folio_test_partial_kmap(page_folio(dst_page)) ||
1220 		    folio_test_partial_kmap(page_folio(src_page))) {
1221 			dst_page += dst_offset / PAGE_SIZE;
1222 			dst_offset = offset_in_page(dst_offset);
1223 			src_page += src_offset / PAGE_SIZE;
1224 			src_offset = offset_in_page(src_offset);
1225 			n = min(PAGE_SIZE - src_offset, PAGE_SIZE - dst_offset);
1226 			n = min(n, len);
1227 		}
1228 
1229 		dst_addr = kmap_local_page(dst_page) + dst_offset;
1230 		src_addr = kmap_local_page(src_page) + src_offset;
1231 
1232 		memcpy(dst_addr, src_addr, n);
1233 
1234 		kunmap_local(src_addr);
1235 		kunmap_local(dst_addr);
1236 
1237 		cc->size -= n;
1238 		cc->offset += n;
1239 		src_offset += n;
1240 		len -= n;
1241 		copied += n;
1242 	}
1243 	return copied;
1244 }
1245 
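/*
 * Copy fallback: when data cannot be delivered zero-copy (e.g. it sits in
 * the skb head or in a regular page frag), grab a free niov from the
 * area, copy the payload into it and post a CQE as if it had been
 * received zero-copy.
 */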
1246 static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
1247 				  struct page *src_page, unsigned int src_offset,
1248 				  size_t len)
1249 {
1250 	size_t copied = 0;
1251 	int ret = 0;
1252 
1253 	while (len) {
1254 		struct io_copy_cache cc;
1255 		struct net_iov *niov;
1256 		size_t n;
1257 
1258 		niov = io_alloc_fallback_niov(ifq);
1259 		if (!niov) {
1260 			ret = -ENOMEM;
1261 			break;
1262 		}
1263 
1264 		cc.page = io_zcrx_iov_page(niov);
1265 		cc.offset = 0;
1266 		cc.size = PAGE_SIZE;
1267 
1268 		n = io_copy_page(&cc, src_page, src_offset, len);
1269 
1270 		if (!io_zcrx_queue_cqe(req, niov, ifq, 0, n)) {
1271 			io_zcrx_return_niov(niov);
1272 			ret = -ENOSPC;
1273 			break;
1274 		}
1275 
1276 		io_zcrx_get_niov_uref(niov);
1277 		src_offset += n;
1278 		len -= n;
1279 		copied += n;
1280 	}
1281 
1282 	return copied ? copied : ret;
1283 }
1284 
1285 static int io_zcrx_copy_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
1286 			     const skb_frag_t *frag, int off, int len)
1287 {
1288 	struct page *page = skb_frag_page(frag);
1289 
1290 	return io_zcrx_copy_chunk(req, ifq, page, off + skb_frag_off(frag), len);
1291 }
1292 
1293 static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
1294 			     const skb_frag_t *frag, int off, int len)
1295 {
1296 	struct net_iov *niov;
1297 	struct page_pool *pp;
1298 
1299 	if (unlikely(!skb_frag_is_net_iov(frag)))
1300 		return io_zcrx_copy_frag(req, ifq, frag, off, len);
1301 
1302 	niov = netmem_to_net_iov(frag->netmem);
1303 	pp = niov->desc.pp;
1304 
1305 	if (!pp || pp->mp_ops != &io_uring_pp_zc_ops || io_pp_to_ifq(pp) != ifq)
1306 		return -EFAULT;
1307 
1308 	if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len))
1309 		return -ENOSPC;
1310 
1311 	/*
1312 	 * Prevent it from being recycled while user is accessing it.
1313 	 * It has to be done before grabbing a user reference.
1314 	 */
1315 	page_pool_ref_netmem(net_iov_to_netmem(niov));
1316 	io_zcrx_get_niov_uref(niov);
1317 	return len;
1318 }
1319 
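/*
 * tcp_read_sock() callback: walk the skb head, frags and frag list,
 * posting zero-copy CQEs for io_uring-provided frags and falling back to
 * copying for everything else. The number of skbs consumed per call is
 * capped by IO_SKBS_PER_CALL_LIMIT.
 */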
1320 static int
1321 io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
1322 		 unsigned int offset, size_t len)
1323 {
1324 	struct io_zcrx_args *args = desc->arg.data;
1325 	struct io_zcrx_ifq *ifq = args->ifq;
1326 	struct io_kiocb *req = args->req;
1327 	struct sk_buff *frag_iter;
1328 	unsigned start, start_off = offset;
1329 	int i, copy, end, off;
1330 	int ret = 0;
1331 
1332 	len = min_t(size_t, len, desc->count);
1333 	/*
1334 	 * __tcp_read_sock() always calls io_zcrx_recv_skb one last time, even
1335 	 * if desc->count is already 0. This is caused by the if (offset + 1 !=
1336 	 * skb->len) check. Return early in this case to break out of
1337 	 * __tcp_read_sock().
1338 	 */
1339 	if (!len)
1340 		return 0;
1341 	if (unlikely(args->nr_skbs++ > IO_SKBS_PER_CALL_LIMIT))
1342 		return -EAGAIN;
1343 
1344 	if (unlikely(offset < skb_headlen(skb))) {
1345 		ssize_t copied;
1346 		size_t to_copy;
1347 
1348 		to_copy = min_t(size_t, skb_headlen(skb) - offset, len);
1349 		copied = io_zcrx_copy_chunk(req, ifq, virt_to_page(skb->data),
1350 					    offset_in_page(skb->data) + offset,
1351 					    to_copy);
1352 		if (copied < 0) {
1353 			ret = copied;
1354 			goto out;
1355 		}
1356 		offset += copied;
1357 		len -= copied;
1358 		if (!len)
1359 			goto out;
1360 		if (offset != skb_headlen(skb))
1361 			goto out;
1362 	}
1363 
1364 	start = skb_headlen(skb);
1365 
1366 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1367 		const skb_frag_t *frag;
1368 
1369 		if (WARN_ON(start > offset + len))
1370 			return -EFAULT;
1371 
1372 		frag = &skb_shinfo(skb)->frags[i];
1373 		end = start + skb_frag_size(frag);
1374 
1375 		if (offset < end) {
1376 			copy = end - offset;
1377 			if (copy > len)
1378 				copy = len;
1379 
1380 			off = offset - start;
1381 			ret = io_zcrx_recv_frag(req, ifq, frag, off, copy);
1382 			if (ret < 0)
1383 				goto out;
1384 
1385 			offset += ret;
1386 			len -= ret;
1387 			if (len == 0 || ret != copy)
1388 				goto out;
1389 		}
1390 		start = end;
1391 	}
1392 
1393 	skb_walk_frags(skb, frag_iter) {
1394 		if (WARN_ON(start > offset + len))
1395 			return -EFAULT;
1396 
1397 		end = start + frag_iter->len;
1398 		if (offset < end) {
1399 			size_t count;
1400 
1401 			copy = end - offset;
1402 			if (copy > len)
1403 				copy = len;
1404 
1405 			off = offset - start;
1406 			count = desc->count;
1407 			ret = io_zcrx_recv_skb(desc, frag_iter, off, copy);
1408 			desc->count = count;
1409 			if (ret < 0)
1410 				goto out;
1411 
1412 			offset += ret;
1413 			len -= ret;
1414 			if (len == 0 || ret != copy)
1415 				goto out;
1416 		}
1417 		start = end;
1418 	}
1419 
1420 out:
1421 	if (offset == start_off)
1422 		return ret;
1423 	desc->count -= (offset - start_off);
1424 	return offset - start_off;
1425 }
1426 
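/*
 * Receive from a TCP socket under the socket lock via tcp_read_sock(),
 * mapping the outcome onto io_uring semantics: -EAGAIN when nothing is
 * available, IOU_REQUEUE when a multishot request should be re-issued,
 * and -ENOTCONN once the connection is closed.
 */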
1427 static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
1428 				struct sock *sk, int flags,
1429 				unsigned issue_flags, unsigned int *outlen)
1430 {
1431 	unsigned int len = *outlen;
1432 	struct io_zcrx_args args = {
1433 		.req = req,
1434 		.ifq = ifq,
1435 		.sock = sk->sk_socket,
1436 	};
1437 	read_descriptor_t rd_desc = {
1438 		.count = len ? len : UINT_MAX,
1439 		.arg.data = &args,
1440 	};
1441 	int ret;
1442 
1443 	lock_sock(sk);
1444 	ret = tcp_read_sock(sk, &rd_desc, io_zcrx_recv_skb);
1445 	if (len && ret > 0)
1446 		*outlen = len - ret;
1447 	if (ret <= 0) {
1448 		if (ret < 0 || sock_flag(sk, SOCK_DONE))
1449 			goto out;
1450 		if (sk->sk_err)
1451 			ret = sock_error(sk);
1452 		else if (sk->sk_shutdown & RCV_SHUTDOWN)
1453 			goto out;
1454 		else if (sk->sk_state == TCP_CLOSE)
1455 			ret = -ENOTCONN;
1456 		else
1457 			ret = -EAGAIN;
1458 	} else if (unlikely(args.nr_skbs > IO_SKBS_PER_CALL_LIMIT) &&
1459 		   (issue_flags & IO_URING_F_MULTISHOT)) {
1460 		ret = IOU_REQUEUE;
1461 	} else if (sock_flag(sk, SOCK_DONE)) {
1462 		/* Make it retry until it finally gets 0. */
1463 		if (issue_flags & IO_URING_F_MULTISHOT)
1464 			ret = IOU_REQUEUE;
1465 		else
1466 			ret = -EAGAIN;
1467 	}
1468 out:
1469 	release_sock(sk);
1470 	return ret;
1471 }
1472 
1473 int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
1474 		 struct socket *sock, unsigned int flags,
1475 		 unsigned issue_flags, unsigned int *len)
1476 {
1477 	struct sock *sk = sock->sk;
1478 	const struct proto *prot = READ_ONCE(sk->sk_prot);
1479 
1480 	if (prot->recvmsg != tcp_recvmsg)
1481 		return -EPROTONOSUPPORT;
1482 
1483 	sock_rps_record_flow(sk);
1484 	return io_zcrx_tcp_recvmsg(req, ifq, sk, flags, issue_flags, len);
1485 }
1486