xref: /linux/drivers/net/ethernet/google/gve/gve_rx.c (revision 223981db9bafb80f558162c148f261e2ff043dbe)
1 // SPDX-License-Identifier: (GPL-2.0 OR MIT)
2 /* Google virtual Ethernet (gve) driver
3  *
4  * Copyright (C) 2015-2021 Google, Inc.
5  */
6 
7 #include "gve.h"
8 #include "gve_adminq.h"
9 #include "gve_utils.h"
10 #include <linux/etherdevice.h>
11 #include <linux/filter.h>
12 #include <net/xdp.h>
13 #include <net/xdp_sock_drv.h>
14 
15 static void gve_rx_free_buffer(struct device *dev,
16 			       struct gve_rx_slot_page_info *page_info,
17 			       union gve_rx_data_slot *data_slot)
18 {
19 	dma_addr_t dma = (dma_addr_t)(be64_to_cpu(data_slot->addr) &
20 				      GVE_DATA_SLOT_ADDR_PAGE_MASK);
21 
22 	page_ref_sub(page_info->page, page_info->pagecnt_bias - 1);
23 	gve_free_page(dev, page_info->page, dma, DMA_FROM_DEVICE);
24 }
25 
26 static void gve_rx_unfill_pages(struct gve_priv *priv, struct gve_rx_ring *rx)
27 {
28 	u32 slots = rx->mask + 1;
29 	int i;
30 
31 	if (rx->data.raw_addressing) {
32 		for (i = 0; i < slots; i++)
33 			gve_rx_free_buffer(&priv->pdev->dev, &rx->data.page_info[i],
34 					   &rx->data.data_ring[i]);
35 	} else {
36 		for (i = 0; i < slots; i++)
37 			page_ref_sub(rx->data.page_info[i].page,
38 				     rx->data.page_info[i].pagecnt_bias - 1);
39 		gve_unassign_qpl(priv, rx->data.qpl->id);
40 		rx->data.qpl = NULL;
41 
42 		for (i = 0; i < rx->qpl_copy_pool_mask + 1; i++) {
43 			page_ref_sub(rx->qpl_copy_pool[i].page,
44 				     rx->qpl_copy_pool[i].pagecnt_bias - 1);
45 			put_page(rx->qpl_copy_pool[i].page);
46 		}
47 	}
48 	kvfree(rx->data.page_info);
49 	rx->data.page_info = NULL;
50 }
51 
52 static void gve_rx_free_ring(struct gve_priv *priv, int idx)
53 {
54 	struct gve_rx_ring *rx = &priv->rx[idx];
55 	struct device *dev = &priv->pdev->dev;
56 	u32 slots = rx->mask + 1;
57 	size_t bytes;
58 
59 	gve_rx_remove_from_block(priv, idx);
60 
61 	bytes = sizeof(struct gve_rx_desc) * priv->rx_desc_cnt;
62 	dma_free_coherent(dev, bytes, rx->desc.desc_ring, rx->desc.bus);
63 	rx->desc.desc_ring = NULL;
64 
65 	dma_free_coherent(dev, sizeof(*rx->q_resources),
66 			  rx->q_resources, rx->q_resources_bus);
67 	rx->q_resources = NULL;
68 
69 	gve_rx_unfill_pages(priv, rx);
70 
71 	bytes = sizeof(*rx->data.data_ring) * slots;
72 	dma_free_coherent(dev, bytes, rx->data.data_ring,
73 			  rx->data.data_bus);
74 	rx->data.data_ring = NULL;
75 
76 	kvfree(rx->qpl_copy_pool);
77 	rx->qpl_copy_pool = NULL;
78 
79 	netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx);
80 }
81 
82 static void gve_setup_rx_buffer(struct gve_rx_slot_page_info *page_info,
83 			     dma_addr_t addr, struct page *page, __be64 *slot_addr)
84 {
85 	page_info->page = page;
86 	page_info->page_offset = 0;
87 	page_info->page_address = page_address(page);
88 	*slot_addr = cpu_to_be64(addr);
89 	/* The page already has 1 ref */
90 	page_ref_add(page, INT_MAX - 1);
91 	page_info->pagecnt_bias = INT_MAX;
92 }
93 
94 static int gve_rx_alloc_buffer(struct gve_priv *priv, struct device *dev,
95 			       struct gve_rx_slot_page_info *page_info,
96 			       union gve_rx_data_slot *data_slot)
97 {
98 	struct page *page;
99 	dma_addr_t dma;
100 	int err;
101 
102 	err = gve_alloc_page(priv, dev, &page, &dma, DMA_FROM_DEVICE,
103 			     GFP_ATOMIC);
104 	if (err)
105 		return err;
106 
107 	gve_setup_rx_buffer(page_info, dma, page, &data_slot->addr);
108 	return 0;
109 }
110 
111 static int gve_prefill_rx_pages(struct gve_rx_ring *rx)
112 {
113 	struct gve_priv *priv = rx->gve;
114 	u32 slots;
115 	int err;
116 	int i;
117 	int j;
118 
119 	/* Allocate one page per Rx queue slot. Each page is split into two
120 	 * packet buffers, when possible we "page flip" between the two.
121 	 */
122 	slots = rx->mask + 1;
123 
124 	rx->data.page_info = kvzalloc(slots *
125 				      sizeof(*rx->data.page_info), GFP_KERNEL);
126 	if (!rx->data.page_info)
127 		return -ENOMEM;
128 
129 	if (!rx->data.raw_addressing) {
130 		rx->data.qpl = gve_assign_rx_qpl(priv, rx->q_num);
131 		if (!rx->data.qpl) {
132 			kvfree(rx->data.page_info);
133 			rx->data.page_info = NULL;
134 			return -ENOMEM;
135 		}
136 	}
137 	for (i = 0; i < slots; i++) {
138 		if (!rx->data.raw_addressing) {
139 			struct page *page = rx->data.qpl->pages[i];
140 			dma_addr_t addr = i * PAGE_SIZE;
141 
142 			gve_setup_rx_buffer(&rx->data.page_info[i], addr, page,
143 					    &rx->data.data_ring[i].qpl_offset);
144 			continue;
145 		}
146 		err = gve_rx_alloc_buffer(priv, &priv->pdev->dev, &rx->data.page_info[i],
147 					  &rx->data.data_ring[i]);
148 		if (err)
149 			goto alloc_err_rda;
150 	}
151 
152 	if (!rx->data.raw_addressing) {
153 		for (j = 0; j < rx->qpl_copy_pool_mask + 1; j++) {
154 			struct page *page = alloc_page(GFP_KERNEL);
155 
156 			if (!page) {
157 				err = -ENOMEM;
158 				goto alloc_err_qpl;
159 			}
160 
161 			rx->qpl_copy_pool[j].page = page;
162 			rx->qpl_copy_pool[j].page_offset = 0;
163 			rx->qpl_copy_pool[j].page_address = page_address(page);
164 
165 			/* The page already has 1 ref. */
166 			page_ref_add(page, INT_MAX - 1);
167 			rx->qpl_copy_pool[j].pagecnt_bias = INT_MAX;
168 		}
169 	}
170 
171 	return slots;
172 
173 alloc_err_qpl:
174 	/* Fully free the copy pool pages. */
175 	while (j--) {
176 		page_ref_sub(rx->qpl_copy_pool[j].page,
177 			     rx->qpl_copy_pool[j].pagecnt_bias - 1);
178 		put_page(rx->qpl_copy_pool[j].page);
179 	}
180 
181 	/* Do not fully free QPL pages - only remove the bias added in this
182 	 * function with gve_setup_rx_buffer.
183 	 */
184 	while (i--)
185 		page_ref_sub(rx->data.page_info[i].page,
186 			     rx->data.page_info[i].pagecnt_bias - 1);
187 
188 	gve_unassign_qpl(priv, rx->data.qpl->id);
189 	rx->data.qpl = NULL;
190 
191 	return err;
192 
193 alloc_err_rda:
194 	while (i--)
195 		gve_rx_free_buffer(&priv->pdev->dev,
196 				   &rx->data.page_info[i],
197 				   &rx->data.data_ring[i]);
198 	return err;
199 }
200 
201 static void gve_rx_ctx_clear(struct gve_rx_ctx *ctx)
202 {
203 	ctx->skb_head = NULL;
204 	ctx->skb_tail = NULL;
205 	ctx->total_size = 0;
206 	ctx->frag_cnt = 0;
207 	ctx->drop_pkt = false;
208 }
209 
210 static int gve_rx_alloc_ring(struct gve_priv *priv, int idx)
211 {
212 	struct gve_rx_ring *rx = &priv->rx[idx];
213 	struct device *hdev = &priv->pdev->dev;
214 	u32 slots, npages;
215 	int filled_pages;
216 	size_t bytes;
217 	int err;
218 
219 	netif_dbg(priv, drv, priv->dev, "allocating rx ring\n");
220 	/* Make sure everything is zeroed to start with */
221 	memset(rx, 0, sizeof(*rx));
222 
223 	rx->gve = priv;
224 	rx->q_num = idx;
225 
226 	slots = priv->rx_data_slot_cnt;
227 	rx->mask = slots - 1;
228 	rx->data.raw_addressing = priv->queue_format == GVE_GQI_RDA_FORMAT;
229 
230 	/* alloc rx data ring */
231 	bytes = sizeof(*rx->data.data_ring) * slots;
232 	rx->data.data_ring = dma_alloc_coherent(hdev, bytes,
233 						&rx->data.data_bus,
234 						GFP_KERNEL);
235 	if (!rx->data.data_ring)
236 		return -ENOMEM;
237 
238 	rx->qpl_copy_pool_mask = min_t(u32, U32_MAX, slots * 2) - 1;
239 	rx->qpl_copy_pool_head = 0;
240 	rx->qpl_copy_pool = kvcalloc(rx->qpl_copy_pool_mask + 1,
241 				     sizeof(rx->qpl_copy_pool[0]),
242 				     GFP_KERNEL);
243 
244 	if (!rx->qpl_copy_pool) {
245 		err = -ENOMEM;
246 		goto abort_with_slots;
247 	}
248 
249 	filled_pages = gve_prefill_rx_pages(rx);
250 	if (filled_pages < 0) {
251 		err = -ENOMEM;
252 		goto abort_with_copy_pool;
253 	}
254 	rx->fill_cnt = filled_pages;
255 	/* Ensure data ring slots (packet buffers) are visible. */
256 	dma_wmb();
257 
258 	/* Alloc gve_queue_resources */
259 	rx->q_resources =
260 		dma_alloc_coherent(hdev,
261 				   sizeof(*rx->q_resources),
262 				   &rx->q_resources_bus,
263 				   GFP_KERNEL);
264 	if (!rx->q_resources) {
265 		err = -ENOMEM;
266 		goto abort_filled;
267 	}
268 	netif_dbg(priv, drv, priv->dev, "rx[%d]->data.data_bus=%lx\n", idx,
269 		  (unsigned long)rx->data.data_bus);
270 
271 	/* alloc rx desc ring */
272 	bytes = sizeof(struct gve_rx_desc) * priv->rx_desc_cnt;
273 	npages = bytes / PAGE_SIZE;
274 	if (npages * PAGE_SIZE != bytes) {
275 		err = -EIO;
276 		goto abort_with_q_resources;
277 	}
278 
279 	rx->desc.desc_ring = dma_alloc_coherent(hdev, bytes, &rx->desc.bus,
280 						GFP_KERNEL);
281 	if (!rx->desc.desc_ring) {
282 		err = -ENOMEM;
283 		goto abort_with_q_resources;
284 	}
285 	rx->cnt = 0;
286 	rx->db_threshold = priv->rx_desc_cnt / 2;
287 	rx->desc.seqno = 1;
288 
289 	/* Allocating half-page buffers allows page-flipping which is faster
290 	 * than copying or allocating new pages.
291 	 */
292 	rx->packet_buffer_size = PAGE_SIZE / 2;
293 	gve_rx_ctx_clear(&rx->ctx);
294 	gve_rx_add_to_block(priv, idx);
295 
296 	return 0;
297 
298 abort_with_q_resources:
299 	dma_free_coherent(hdev, sizeof(*rx->q_resources),
300 			  rx->q_resources, rx->q_resources_bus);
301 	rx->q_resources = NULL;
302 abort_filled:
303 	gve_rx_unfill_pages(priv, rx);
304 abort_with_copy_pool:
305 	kvfree(rx->qpl_copy_pool);
306 	rx->qpl_copy_pool = NULL;
307 abort_with_slots:
308 	bytes = sizeof(*rx->data.data_ring) * slots;
309 	dma_free_coherent(hdev, bytes, rx->data.data_ring, rx->data.data_bus);
310 	rx->data.data_ring = NULL;
311 
312 	return err;
313 }
314 
315 int gve_rx_alloc_rings(struct gve_priv *priv)
316 {
317 	int err = 0;
318 	int i;
319 
320 	for (i = 0; i < priv->rx_cfg.num_queues; i++) {
321 		err = gve_rx_alloc_ring(priv, i);
322 		if (err) {
323 			netif_err(priv, drv, priv->dev,
324 				  "Failed to alloc rx ring=%d: err=%d\n",
325 				  i, err);
326 			break;
327 		}
328 	}
329 	/* Unallocate if there was an error */
330 	if (err) {
331 		int j;
332 
333 		for (j = 0; j < i; j++)
334 			gve_rx_free_ring(priv, j);
335 	}
336 	return err;
337 }
338 
339 void gve_rx_free_rings_gqi(struct gve_priv *priv)
340 {
341 	int i;
342 
343 	for (i = 0; i < priv->rx_cfg.num_queues; i++)
344 		gve_rx_free_ring(priv, i);
345 }
346 
347 void gve_rx_write_doorbell(struct gve_priv *priv, struct gve_rx_ring *rx)
348 {
349 	u32 db_idx = be32_to_cpu(rx->q_resources->db_index);
350 
351 	iowrite32be(rx->fill_cnt, &priv->db_bar2[db_idx]);
352 }
353 
354 static enum pkt_hash_types gve_rss_type(__be16 pkt_flags)
355 {
356 	if (likely(pkt_flags & (GVE_RXF_TCP | GVE_RXF_UDP)))
357 		return PKT_HASH_TYPE_L4;
358 	if (pkt_flags & (GVE_RXF_IPV4 | GVE_RXF_IPV6))
359 		return PKT_HASH_TYPE_L3;
360 	return PKT_HASH_TYPE_L2;
361 }
362 
363 static struct sk_buff *gve_rx_add_frags(struct napi_struct *napi,
364 					struct gve_rx_slot_page_info *page_info,
365 					u16 packet_buffer_size, u16 len,
366 					struct gve_rx_ctx *ctx)
367 {
368 	u32 offset = page_info->page_offset + page_info->pad;
369 	struct sk_buff *skb = ctx->skb_tail;
370 	int num_frags = 0;
371 
372 	if (!skb) {
373 		skb = napi_get_frags(napi);
374 		if (unlikely(!skb))
375 			return NULL;
376 
377 		ctx->skb_head = skb;
378 		ctx->skb_tail = skb;
379 	} else {
380 		num_frags = skb_shinfo(ctx->skb_tail)->nr_frags;
381 		if (num_frags == MAX_SKB_FRAGS) {
382 			skb = napi_alloc_skb(napi, 0);
383 			if (!skb)
384 				return NULL;
385 
386 			// We will never chain more than two SKBs: 2 * 16 * 2k > 64k
387 			// which is why we do not need to chain by using skb->next
388 			skb_shinfo(ctx->skb_tail)->frag_list = skb;
389 
390 			ctx->skb_tail = skb;
391 			num_frags = 0;
392 		}
393 	}
394 
395 	if (skb != ctx->skb_head) {
396 		ctx->skb_head->len += len;
397 		ctx->skb_head->data_len += len;
398 		ctx->skb_head->truesize += packet_buffer_size;
399 	}
400 	skb_add_rx_frag(skb, num_frags, page_info->page,
401 			offset, len, packet_buffer_size);
402 
403 	return ctx->skb_head;
404 }
405 
406 static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, __be64 *slot_addr)
407 {
408 	const __be64 offset = cpu_to_be64(PAGE_SIZE / 2);
409 
410 	/* "flip" to other packet buffer on this page */
411 	page_info->page_offset ^= PAGE_SIZE / 2;
412 	*(slot_addr) ^= offset;
413 }
414 
415 static int gve_rx_can_recycle_buffer(struct gve_rx_slot_page_info *page_info)
416 {
417 	int pagecount = page_count(page_info->page);
418 
419 	/* This page is not being used by any SKBs - reuse */
420 	if (pagecount == page_info->pagecnt_bias)
421 		return 1;
422 	/* This page is still being used by an SKB - we can't reuse */
423 	else if (pagecount > page_info->pagecnt_bias)
424 		return 0;
425 	WARN(pagecount < page_info->pagecnt_bias,
426 	     "Pagecount should never be less than the bias.");
427 	return -1;
428 }
429 
430 static struct sk_buff *
431 gve_rx_raw_addressing(struct device *dev, struct net_device *netdev,
432 		      struct gve_rx_slot_page_info *page_info, u16 len,
433 		      struct napi_struct *napi,
434 		      union gve_rx_data_slot *data_slot,
435 		      u16 packet_buffer_size, struct gve_rx_ctx *ctx)
436 {
437 	struct sk_buff *skb = gve_rx_add_frags(napi, page_info, packet_buffer_size, len, ctx);
438 
439 	if (!skb)
440 		return NULL;
441 
442 	/* Optimistically stop the kernel from freeing the page.
443 	 * We will check again in refill to determine if we need to alloc a
444 	 * new page.
445 	 */
446 	gve_dec_pagecnt_bias(page_info);
447 
448 	return skb;
449 }
450 
451 static struct sk_buff *gve_rx_copy_to_pool(struct gve_rx_ring *rx,
452 					   struct gve_rx_slot_page_info *page_info,
453 					   u16 len, struct napi_struct *napi)
454 {
455 	u32 pool_idx = rx->qpl_copy_pool_head & rx->qpl_copy_pool_mask;
456 	void *src = page_info->page_address + page_info->page_offset;
457 	struct gve_rx_slot_page_info *copy_page_info;
458 	struct gve_rx_ctx *ctx = &rx->ctx;
459 	bool alloc_page = false;
460 	struct sk_buff *skb;
461 	void *dst;
462 
463 	copy_page_info = &rx->qpl_copy_pool[pool_idx];
464 	if (!copy_page_info->can_flip) {
465 		int recycle = gve_rx_can_recycle_buffer(copy_page_info);
466 
467 		if (unlikely(recycle < 0)) {
468 			gve_schedule_reset(rx->gve);
469 			return NULL;
470 		}
471 		alloc_page = !recycle;
472 	}
473 
474 	if (alloc_page) {
475 		struct gve_rx_slot_page_info alloc_page_info;
476 		struct page *page;
477 
478 		/* The least recently used page turned out to be
479 		 * still in use by the kernel. Ignoring it and moving
480 		 * on alleviates head-of-line blocking.
481 		 */
482 		rx->qpl_copy_pool_head++;
483 
484 		page = alloc_page(GFP_ATOMIC);
485 		if (!page)
486 			return NULL;
487 
488 		alloc_page_info.page = page;
489 		alloc_page_info.page_offset = 0;
490 		alloc_page_info.page_address = page_address(page);
491 		alloc_page_info.pad = page_info->pad;
492 
493 		memcpy(alloc_page_info.page_address, src, page_info->pad + len);
494 		skb = gve_rx_add_frags(napi, &alloc_page_info,
495 				       rx->packet_buffer_size,
496 				       len, ctx);
497 
498 		u64_stats_update_begin(&rx->statss);
499 		rx->rx_frag_copy_cnt++;
500 		rx->rx_frag_alloc_cnt++;
501 		u64_stats_update_end(&rx->statss);
502 
503 		return skb;
504 	}
505 
506 	dst = copy_page_info->page_address + copy_page_info->page_offset;
507 	memcpy(dst, src, page_info->pad + len);
508 	copy_page_info->pad = page_info->pad;
509 
510 	skb = gve_rx_add_frags(napi, copy_page_info,
511 			       rx->packet_buffer_size, len, ctx);
512 	if (unlikely(!skb))
513 		return NULL;
514 
515 	gve_dec_pagecnt_bias(copy_page_info);
516 	copy_page_info->page_offset += rx->packet_buffer_size;
517 	copy_page_info->page_offset &= (PAGE_SIZE - 1);
518 
519 	if (copy_page_info->can_flip) {
520 		/* We have used both halves of this copy page, it
521 		 * is time for it to go to the back of the queue.
522 		 */
523 		copy_page_info->can_flip = false;
524 		rx->qpl_copy_pool_head++;
525 		prefetch(rx->qpl_copy_pool[rx->qpl_copy_pool_head & rx->qpl_copy_pool_mask].page);
526 	} else {
527 		copy_page_info->can_flip = true;
528 	}
529 
530 	u64_stats_update_begin(&rx->statss);
531 	rx->rx_frag_copy_cnt++;
532 	u64_stats_update_end(&rx->statss);
533 
534 	return skb;
535 }
536 
537 static struct sk_buff *
538 gve_rx_qpl(struct device *dev, struct net_device *netdev,
539 	   struct gve_rx_ring *rx, struct gve_rx_slot_page_info *page_info,
540 	   u16 len, struct napi_struct *napi,
541 	   union gve_rx_data_slot *data_slot)
542 {
543 	struct gve_rx_ctx *ctx = &rx->ctx;
544 	struct sk_buff *skb;
545 
546 	/* if raw_addressing mode is not enabled gvnic can only receive into
547 	 * registered segments. If the buffer can't be recycled, our only
548 	 * choice is to copy the data out of it so that we can return it to the
549 	 * device.
550 	 */
551 	if (page_info->can_flip) {
552 		skb = gve_rx_add_frags(napi, page_info, rx->packet_buffer_size, len, ctx);
553 		/* No point in recycling if we didn't get the skb */
554 		if (skb) {
555 			/* Make sure that the page isn't freed. */
556 			gve_dec_pagecnt_bias(page_info);
557 			gve_rx_flip_buff(page_info, &data_slot->qpl_offset);
558 		}
559 	} else {
560 		skb = gve_rx_copy_to_pool(rx, page_info, len, napi);
561 	}
562 	return skb;
563 }
564 
565 static struct sk_buff *gve_rx_skb(struct gve_priv *priv, struct gve_rx_ring *rx,
566 				  struct gve_rx_slot_page_info *page_info, struct napi_struct *napi,
567 				  u16 len, union gve_rx_data_slot *data_slot,
568 				  bool is_only_frag)
569 {
570 	struct net_device *netdev = priv->dev;
571 	struct gve_rx_ctx *ctx = &rx->ctx;
572 	struct sk_buff *skb = NULL;
573 
574 	if (len <= priv->rx_copybreak && is_only_frag)  {
575 		/* Just copy small packets */
576 		skb = gve_rx_copy(netdev, napi, page_info, len);
577 		if (skb) {
578 			u64_stats_update_begin(&rx->statss);
579 			rx->rx_copied_pkt++;
580 			rx->rx_frag_copy_cnt++;
581 			rx->rx_copybreak_pkt++;
582 			u64_stats_update_end(&rx->statss);
583 		}
584 	} else {
585 		int recycle = gve_rx_can_recycle_buffer(page_info);
586 
587 		if (unlikely(recycle < 0)) {
588 			gve_schedule_reset(priv);
589 			return NULL;
590 		}
591 		page_info->can_flip = recycle;
592 		if (page_info->can_flip) {
593 			u64_stats_update_begin(&rx->statss);
594 			rx->rx_frag_flip_cnt++;
595 			u64_stats_update_end(&rx->statss);
596 		}
597 
598 		if (rx->data.raw_addressing) {
599 			skb = gve_rx_raw_addressing(&priv->pdev->dev, netdev,
600 						    page_info, len, napi,
601 						    data_slot,
602 						    rx->packet_buffer_size, ctx);
603 		} else {
604 			skb = gve_rx_qpl(&priv->pdev->dev, netdev, rx,
605 					 page_info, len, napi, data_slot);
606 		}
607 	}
608 	return skb;
609 }
610 
611 static int gve_xsk_pool_redirect(struct net_device *dev,
612 				 struct gve_rx_ring *rx,
613 				 void *data, int len,
614 				 struct bpf_prog *xdp_prog)
615 {
616 	struct xdp_buff *xdp;
617 	int err;
618 
619 	if (rx->xsk_pool->frame_len < len)
620 		return -E2BIG;
621 	xdp = xsk_buff_alloc(rx->xsk_pool);
622 	if (!xdp) {
623 		u64_stats_update_begin(&rx->statss);
624 		rx->xdp_alloc_fails++;
625 		u64_stats_update_end(&rx->statss);
626 		return -ENOMEM;
627 	}
628 	xdp->data_end = xdp->data + len;
629 	memcpy(xdp->data, data, len);
630 	err = xdp_do_redirect(dev, xdp, xdp_prog);
631 	if (err)
632 		xsk_buff_free(xdp);
633 	return err;
634 }
635 
636 static int gve_xdp_redirect(struct net_device *dev, struct gve_rx_ring *rx,
637 			    struct xdp_buff *orig, struct bpf_prog *xdp_prog)
638 {
639 	int total_len, len = orig->data_end - orig->data;
640 	int headroom = XDP_PACKET_HEADROOM;
641 	struct xdp_buff new;
642 	void *frame;
643 	int err;
644 
645 	if (rx->xsk_pool)
646 		return gve_xsk_pool_redirect(dev, rx, orig->data,
647 					     len, xdp_prog);
648 
649 	total_len = headroom + SKB_DATA_ALIGN(len) +
650 		SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
651 	frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
652 	if (!frame) {
653 		u64_stats_update_begin(&rx->statss);
654 		rx->xdp_alloc_fails++;
655 		u64_stats_update_end(&rx->statss);
656 		return -ENOMEM;
657 	}
658 	xdp_init_buff(&new, total_len, &rx->xdp_rxq);
659 	xdp_prepare_buff(&new, frame, headroom, len, false);
660 	memcpy(new.data, orig->data, len);
661 
662 	err = xdp_do_redirect(dev, &new, xdp_prog);
663 	if (err)
664 		page_frag_free(frame);
665 
666 	return err;
667 }
668 
669 static void gve_xdp_done(struct gve_priv *priv, struct gve_rx_ring *rx,
670 			 struct xdp_buff *xdp, struct bpf_prog *xprog,
671 			 int xdp_act)
672 {
673 	struct gve_tx_ring *tx;
674 	int tx_qid;
675 	int err;
676 
677 	switch (xdp_act) {
678 	case XDP_ABORTED:
679 	case XDP_DROP:
680 	default:
681 		break;
682 	case XDP_TX:
683 		tx_qid = gve_xdp_tx_queue_id(priv, rx->q_num);
684 		tx = &priv->tx[tx_qid];
685 		spin_lock(&tx->xdp_lock);
686 		err = gve_xdp_xmit_one(priv, tx, xdp->data,
687 				       xdp->data_end - xdp->data, NULL);
688 		spin_unlock(&tx->xdp_lock);
689 
690 		if (unlikely(err)) {
691 			u64_stats_update_begin(&rx->statss);
692 			rx->xdp_tx_errors++;
693 			u64_stats_update_end(&rx->statss);
694 		}
695 		break;
696 	case XDP_REDIRECT:
697 		err = gve_xdp_redirect(priv->dev, rx, xdp, xprog);
698 
699 		if (unlikely(err)) {
700 			u64_stats_update_begin(&rx->statss);
701 			rx->xdp_redirect_errors++;
702 			u64_stats_update_end(&rx->statss);
703 		}
704 		break;
705 	}
706 	u64_stats_update_begin(&rx->statss);
707 	if ((u32)xdp_act < GVE_XDP_ACTIONS)
708 		rx->xdp_actions[xdp_act]++;
709 	u64_stats_update_end(&rx->statss);
710 }
711 
712 #define GVE_PKTCONT_BIT_IS_SET(x) (GVE_RXF_PKT_CONT & (x))
713 static void gve_rx(struct gve_rx_ring *rx, netdev_features_t feat,
714 		   struct gve_rx_desc *desc, u32 idx,
715 		   struct gve_rx_cnts *cnts)
716 {
717 	bool is_last_frag = !GVE_PKTCONT_BIT_IS_SET(desc->flags_seq);
718 	struct gve_rx_slot_page_info *page_info;
719 	u16 frag_size = be16_to_cpu(desc->len);
720 	struct gve_rx_ctx *ctx = &rx->ctx;
721 	union gve_rx_data_slot *data_slot;
722 	struct gve_priv *priv = rx->gve;
723 	struct sk_buff *skb = NULL;
724 	struct bpf_prog *xprog;
725 	struct xdp_buff xdp;
726 	dma_addr_t page_bus;
727 	void *va;
728 
729 	u16 len = frag_size;
730 	struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi;
731 	bool is_first_frag = ctx->frag_cnt == 0;
732 
733 	bool is_only_frag = is_first_frag && is_last_frag;
734 
735 	if (unlikely(ctx->drop_pkt))
736 		goto finish_frag;
737 
738 	if (desc->flags_seq & GVE_RXF_ERR) {
739 		ctx->drop_pkt = true;
740 		cnts->desc_err_pkt_cnt++;
741 		napi_free_frags(napi);
742 		goto finish_frag;
743 	}
744 
745 	if (unlikely(frag_size > rx->packet_buffer_size)) {
746 		netdev_warn(priv->dev, "Unexpected frag size %d, can't exceed %d, scheduling reset",
747 			    frag_size, rx->packet_buffer_size);
748 		ctx->drop_pkt = true;
749 		napi_free_frags(napi);
750 		gve_schedule_reset(rx->gve);
751 		goto finish_frag;
752 	}
753 
754 	/* Prefetch two packet buffers ahead, we will need it soon. */
755 	page_info = &rx->data.page_info[(idx + 2) & rx->mask];
756 	va = page_info->page_address + page_info->page_offset;
757 	prefetch(page_info->page); /* Kernel page struct. */
758 	prefetch(va);              /* Packet header. */
759 	prefetch(va + 64);         /* Next cacheline too. */
760 
761 	page_info = &rx->data.page_info[idx];
762 	data_slot = &rx->data.data_ring[idx];
763 	page_bus = (rx->data.raw_addressing) ?
764 		be64_to_cpu(data_slot->addr) - page_info->page_offset :
765 		rx->data.qpl->page_buses[idx];
766 	dma_sync_single_for_cpu(&priv->pdev->dev, page_bus,
767 				PAGE_SIZE, DMA_FROM_DEVICE);
768 	page_info->pad = is_first_frag ? GVE_RX_PAD : 0;
769 	len -= page_info->pad;
770 	frag_size -= page_info->pad;
771 
772 	xprog = READ_ONCE(priv->xdp_prog);
773 	if (xprog && is_only_frag) {
774 		void *old_data;
775 		int xdp_act;
776 
777 		xdp_init_buff(&xdp, rx->packet_buffer_size, &rx->xdp_rxq);
778 		xdp_prepare_buff(&xdp, page_info->page_address +
779 				 page_info->page_offset, GVE_RX_PAD,
780 				 len, false);
781 		old_data = xdp.data;
782 		xdp_act = bpf_prog_run_xdp(xprog, &xdp);
783 		if (xdp_act != XDP_PASS) {
784 			gve_xdp_done(priv, rx, &xdp, xprog, xdp_act);
785 			ctx->total_size += frag_size;
786 			goto finish_ok_pkt;
787 		}
788 
789 		page_info->pad += xdp.data - old_data;
790 		len = xdp.data_end - xdp.data;
791 
792 		u64_stats_update_begin(&rx->statss);
793 		rx->xdp_actions[XDP_PASS]++;
794 		u64_stats_update_end(&rx->statss);
795 	}
796 
797 	skb = gve_rx_skb(priv, rx, page_info, napi, len,
798 			 data_slot, is_only_frag);
799 	if (!skb) {
800 		u64_stats_update_begin(&rx->statss);
801 		rx->rx_skb_alloc_fail++;
802 		u64_stats_update_end(&rx->statss);
803 
804 		napi_free_frags(napi);
805 		ctx->drop_pkt = true;
806 		goto finish_frag;
807 	}
808 	ctx->total_size += frag_size;
809 
810 	if (is_first_frag) {
811 		if (likely(feat & NETIF_F_RXCSUM)) {
812 			/* NIC passes up the partial sum */
813 			if (desc->csum)
814 				skb->ip_summed = CHECKSUM_COMPLETE;
815 			else
816 				skb->ip_summed = CHECKSUM_NONE;
817 			skb->csum = csum_unfold(desc->csum);
818 		}
819 
820 		/* parse flags & pass relevant info up */
821 		if (likely(feat & NETIF_F_RXHASH) &&
822 		    gve_needs_rss(desc->flags_seq))
823 			skb_set_hash(skb, be32_to_cpu(desc->rss_hash),
824 				     gve_rss_type(desc->flags_seq));
825 	}
826 
827 	if (is_last_frag) {
828 		skb_record_rx_queue(skb, rx->q_num);
829 		if (skb_is_nonlinear(skb))
830 			napi_gro_frags(napi);
831 		else
832 			napi_gro_receive(napi, skb);
833 		goto finish_ok_pkt;
834 	}
835 
836 	goto finish_frag;
837 
838 finish_ok_pkt:
839 	cnts->ok_pkt_bytes += ctx->total_size;
840 	cnts->ok_pkt_cnt++;
841 finish_frag:
842 	ctx->frag_cnt++;
843 	if (is_last_frag) {
844 		cnts->total_pkt_cnt++;
845 		cnts->cont_pkt_cnt += (ctx->frag_cnt > 1);
846 		gve_rx_ctx_clear(ctx);
847 	}
848 }
849 
850 bool gve_rx_work_pending(struct gve_rx_ring *rx)
851 {
852 	struct gve_rx_desc *desc;
853 	__be16 flags_seq;
854 	u32 next_idx;
855 
856 	next_idx = rx->cnt & rx->mask;
857 	desc = rx->desc.desc_ring + next_idx;
858 
859 	flags_seq = desc->flags_seq;
860 
861 	return (GVE_SEQNO(flags_seq) == rx->desc.seqno);
862 }
863 
864 static bool gve_rx_refill_buffers(struct gve_priv *priv, struct gve_rx_ring *rx)
865 {
866 	int refill_target = rx->mask + 1;
867 	u32 fill_cnt = rx->fill_cnt;
868 
869 	while (fill_cnt - rx->cnt < refill_target) {
870 		struct gve_rx_slot_page_info *page_info;
871 		u32 idx = fill_cnt & rx->mask;
872 
873 		page_info = &rx->data.page_info[idx];
874 		if (page_info->can_flip) {
875 			/* The other half of the page is free because it was
876 			 * free when we processed the descriptor. Flip to it.
877 			 */
878 			union gve_rx_data_slot *data_slot =
879 						&rx->data.data_ring[idx];
880 
881 			gve_rx_flip_buff(page_info, &data_slot->addr);
882 			page_info->can_flip = 0;
883 		} else {
884 			/* It is possible that the networking stack has already
885 			 * finished processing all outstanding packets in the buffer
886 			 * and it can be reused.
887 			 * Flipping is unnecessary here - if the networking stack still
888 			 * owns half the page it is impossible to tell which half. Either
889 			 * the whole page is free or it needs to be replaced.
890 			 */
891 			int recycle = gve_rx_can_recycle_buffer(page_info);
892 
893 			if (recycle < 0) {
894 				if (!rx->data.raw_addressing)
895 					gve_schedule_reset(priv);
896 				return false;
897 			}
898 			if (!recycle) {
899 				/* We can't reuse the buffer - alloc a new one*/
900 				union gve_rx_data_slot *data_slot =
901 						&rx->data.data_ring[idx];
902 				struct device *dev = &priv->pdev->dev;
903 				gve_rx_free_buffer(dev, page_info, data_slot);
904 				page_info->page = NULL;
905 				if (gve_rx_alloc_buffer(priv, dev, page_info,
906 							data_slot)) {
907 					u64_stats_update_begin(&rx->statss);
908 					rx->rx_buf_alloc_fail++;
909 					u64_stats_update_end(&rx->statss);
910 					break;
911 				}
912 			}
913 		}
914 		fill_cnt++;
915 	}
916 	rx->fill_cnt = fill_cnt;
917 	return true;
918 }
919 
920 static int gve_clean_rx_done(struct gve_rx_ring *rx, int budget,
921 			     netdev_features_t feat)
922 {
923 	u64 xdp_redirects = rx->xdp_actions[XDP_REDIRECT];
924 	u64 xdp_txs = rx->xdp_actions[XDP_TX];
925 	struct gve_rx_ctx *ctx = &rx->ctx;
926 	struct gve_priv *priv = rx->gve;
927 	struct gve_rx_cnts cnts = {0};
928 	struct gve_rx_desc *next_desc;
929 	u32 idx = rx->cnt & rx->mask;
930 	u32 work_done = 0;
931 
932 	struct gve_rx_desc *desc = &rx->desc.desc_ring[idx];
933 
934 	// Exceed budget only if (and till) the inflight packet is consumed.
935 	while ((GVE_SEQNO(desc->flags_seq) == rx->desc.seqno) &&
936 	       (work_done < budget || ctx->frag_cnt)) {
937 		next_desc = &rx->desc.desc_ring[(idx + 1) & rx->mask];
938 		prefetch(next_desc);
939 
940 		gve_rx(rx, feat, desc, idx, &cnts);
941 
942 		rx->cnt++;
943 		idx = rx->cnt & rx->mask;
944 		desc = &rx->desc.desc_ring[idx];
945 		rx->desc.seqno = gve_next_seqno(rx->desc.seqno);
946 		work_done++;
947 	}
948 
949 	// The device will only send whole packets.
950 	if (unlikely(ctx->frag_cnt)) {
951 		struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi;
952 
953 		napi_free_frags(napi);
954 		gve_rx_ctx_clear(&rx->ctx);
955 		netdev_warn(priv->dev, "Unexpected seq number %d with incomplete packet, expected %d, scheduling reset",
956 			    GVE_SEQNO(desc->flags_seq), rx->desc.seqno);
957 		gve_schedule_reset(rx->gve);
958 	}
959 
960 	if (!work_done && rx->fill_cnt - rx->cnt > rx->db_threshold)
961 		return 0;
962 
963 	if (work_done) {
964 		u64_stats_update_begin(&rx->statss);
965 		rx->rpackets += cnts.ok_pkt_cnt;
966 		rx->rbytes += cnts.ok_pkt_bytes;
967 		rx->rx_cont_packet_cnt += cnts.cont_pkt_cnt;
968 		rx->rx_desc_err_dropped_pkt += cnts.desc_err_pkt_cnt;
969 		u64_stats_update_end(&rx->statss);
970 	}
971 
972 	if (xdp_txs != rx->xdp_actions[XDP_TX])
973 		gve_xdp_tx_flush(priv, rx->q_num);
974 
975 	if (xdp_redirects != rx->xdp_actions[XDP_REDIRECT])
976 		xdp_do_flush();
977 
978 	/* restock ring slots */
979 	if (!rx->data.raw_addressing) {
980 		/* In QPL mode buffs are refilled as the desc are processed */
981 		rx->fill_cnt += work_done;
982 	} else if (rx->fill_cnt - rx->cnt <= rx->db_threshold) {
983 		/* In raw addressing mode buffs are only refilled if the avail
984 		 * falls below a threshold.
985 		 */
986 		if (!gve_rx_refill_buffers(priv, rx))
987 			return 0;
988 
989 		/* If we were not able to completely refill buffers, we'll want
990 		 * to schedule this queue for work again to refill buffers.
991 		 */
992 		if (rx->fill_cnt - rx->cnt <= rx->db_threshold) {
993 			gve_rx_write_doorbell(priv, rx);
994 			return budget;
995 		}
996 	}
997 
998 	gve_rx_write_doorbell(priv, rx);
999 	return cnts.total_pkt_cnt;
1000 }
1001 
1002 int gve_rx_poll(struct gve_notify_block *block, int budget)
1003 {
1004 	struct gve_rx_ring *rx = block->rx;
1005 	netdev_features_t feat;
1006 	int work_done = 0;
1007 
1008 	feat = block->napi.dev->features;
1009 
1010 	/* If budget is 0, do all the work */
1011 	if (budget == 0)
1012 		budget = INT_MAX;
1013 
1014 	if (budget > 0)
1015 		work_done = gve_clean_rx_done(rx, budget, feat);
1016 
1017 	return work_done;
1018 }
1019