xref: /linux/drivers/net/ethernet/google/gve/gve_rx_dqo.c (revision 8f7aa3d3c7323f4ca2768a9e74ebbe359c4f8f88)
1 // SPDX-License-Identifier: (GPL-2.0 OR MIT)
2 /* Google virtual Ethernet (gve) driver
3  *
4  * Copyright (C) 2015-2021 Google, Inc.
5  */
6 
7 #include "gve.h"
8 #include "gve_dqo.h"
9 #include "gve_adminq.h"
10 #include "gve_utils.h"
11 #include <linux/bpf.h>
12 #include <linux/ip.h>
13 #include <linux/ipv6.h>
14 #include <linux/skbuff.h>
15 #include <linux/slab.h>
16 #include <net/ip6_checksum.h>
17 #include <net/ipv6.h>
18 #include <net/tcp.h>
19 #include <net/xdp_sock_drv.h>
20 
21 static void gve_rx_free_hdr_bufs(struct gve_priv *priv, struct gve_rx_ring *rx)
22 {
23 	struct device *hdev = &priv->pdev->dev;
24 	int buf_count = rx->dqo.bufq.mask + 1;
25 
26 	if (rx->dqo.hdr_bufs.data) {
27 		dma_free_coherent(hdev, priv->header_buf_size * buf_count,
28 				  rx->dqo.hdr_bufs.data, rx->dqo.hdr_bufs.addr);
29 		rx->dqo.hdr_bufs.data = NULL;
30 	}
31 }
32 
33 static void gve_rx_init_ring_state_dqo(struct gve_rx_ring *rx,
34 				       const u32 buffer_queue_slots,
35 				       const u32 completion_queue_slots)
36 {
37 	int i;
38 
39 	/* Set buffer queue state */
40 	rx->dqo.bufq.mask = buffer_queue_slots - 1;
41 	rx->dqo.bufq.head = 0;
42 	rx->dqo.bufq.tail = 0;
43 
44 	/* Set completion queue state */
45 	rx->dqo.complq.num_free_slots = completion_queue_slots;
46 	rx->dqo.complq.mask = completion_queue_slots - 1;
47 	rx->dqo.complq.cur_gen_bit = 0;
48 	rx->dqo.complq.head = 0;
49 
50 	/* Set RX SKB context */
51 	rx->ctx.skb_head = NULL;
52 	rx->ctx.skb_tail = NULL;
53 
54 	/* Set up linked list of buffer IDs */
55 	if (rx->dqo.buf_states) {
56 		for (i = 0; i < rx->dqo.num_buf_states - 1; i++)
57 			rx->dqo.buf_states[i].next = i + 1;
58 		rx->dqo.buf_states[rx->dqo.num_buf_states - 1].next = -1;
59 	}
60 
61 	rx->dqo.free_buf_states = 0;
62 	rx->dqo.recycled_buf_states.head = -1;
63 	rx->dqo.recycled_buf_states.tail = -1;
64 	rx->dqo.used_buf_states.head = -1;
65 	rx->dqo.used_buf_states.tail = -1;
66 }
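
/* Note on the free-list layout initialized above (an illustrative sketch, not
 * part of the driver build): buf_states forms a singly linked free list
 * threaded through s16 indices, with -1 terminating each list. Popping an
 * entry from such an index-linked list typically looks like:
 *
 *	s16 idx = rx->dqo.free_buf_states;
 *
 *	if (idx == -1)
 *		return NULL;	// no free buffer states left
 *	rx->dqo.free_buf_states = rx->dqo.buf_states[idx].next;
 *	return &rx->dqo.buf_states[idx];
 *
 * The driver's real helpers (e.g. gve_free_buf_state(), used later in this
 * file) live in the buffer management code, not here.
 */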
67 
68 static void gve_rx_reset_ring_dqo(struct gve_priv *priv, int idx)
69 {
70 	struct gve_rx_ring *rx = &priv->rx[idx];
71 	size_t size;
72 	int i;
73 
74 	const u32 buffer_queue_slots = priv->rx_desc_cnt;
75 	const u32 completion_queue_slots = priv->rx_desc_cnt;
76 
77 	/* Reset buffer queue */
78 	if (rx->dqo.bufq.desc_ring) {
79 		size = sizeof(rx->dqo.bufq.desc_ring[0]) *
80 			buffer_queue_slots;
81 		memset(rx->dqo.bufq.desc_ring, 0, size);
82 	}
83 
84 	/* Reset completion queue */
85 	if (rx->dqo.complq.desc_ring) {
86 		size = sizeof(rx->dqo.complq.desc_ring[0]) *
87 			completion_queue_slots;
88 		memset(rx->dqo.complq.desc_ring, 0, size);
89 	}
90 
91 	/* Reset q_resources */
92 	if (rx->q_resources)
93 		memset(rx->q_resources, 0, sizeof(*rx->q_resources));
94 
95 	/* Reset buf states */
96 	if (rx->dqo.buf_states) {
97 		for (i = 0; i < rx->dqo.num_buf_states; i++) {
98 			struct gve_rx_buf_state_dqo *bs = &rx->dqo.buf_states[i];
99 
100 			if (rx->dqo.page_pool)
101 				gve_free_to_page_pool(rx, bs, false);
102 			else
103 				gve_free_qpl_page_dqo(bs);
104 		}
105 	}
106 
107 	gve_rx_init_ring_state_dqo(rx, buffer_queue_slots,
108 				   completion_queue_slots);
109 }
110 
111 void gve_rx_stop_ring_dqo(struct gve_priv *priv, int idx)
112 {
113 	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);
114 	struct gve_rx_ring *rx = &priv->rx[idx];
115 
116 	if (!gve_rx_was_added_to_block(priv, idx))
117 		return;
118 
119 	if (rx->dqo.page_pool)
120 		page_pool_disable_direct_recycling(rx->dqo.page_pool);
121 	gve_remove_napi(priv, ntfy_idx);
122 	gve_rx_remove_from_block(priv, idx);
123 	gve_rx_reset_ring_dqo(priv, idx);
124 }
125 
126 void gve_rx_free_ring_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
127 			  struct gve_rx_alloc_rings_cfg *cfg)
128 {
129 	struct device *hdev = &priv->pdev->dev;
130 	size_t completion_queue_slots;
131 	size_t buffer_queue_slots;
132 	int idx = rx->q_num;
133 	size_t size;
134 	u32 qpl_id;
135 	int i;
136 
137 	completion_queue_slots = rx->dqo.complq.mask + 1;
138 	buffer_queue_slots = rx->dqo.bufq.mask + 1;
139 
140 	if (rx->q_resources) {
141 		dma_free_coherent(hdev, sizeof(*rx->q_resources),
142 				  rx->q_resources, rx->q_resources_bus);
143 		rx->q_resources = NULL;
144 	}
145 
146 	for (i = 0; i < rx->dqo.num_buf_states; i++) {
147 		struct gve_rx_buf_state_dqo *bs = &rx->dqo.buf_states[i];
148 
149 		if (rx->dqo.page_pool)
150 			gve_free_to_page_pool(rx, bs, false);
151 		else
152 			gve_free_qpl_page_dqo(bs);
153 		if (gve_buf_state_is_allocated(rx, bs) && bs->xsk_buff) {
154 			xsk_buff_free(bs->xsk_buff);
155 			bs->xsk_buff = NULL;
156 		}
157 	}
158 
159 	if (rx->dqo.qpl) {
160 		qpl_id = gve_get_rx_qpl_id(cfg->qcfg_tx, rx->q_num);
161 		gve_free_queue_page_list(priv, rx->dqo.qpl, qpl_id);
162 		rx->dqo.qpl = NULL;
163 	}
164 
165 	if (rx->dqo.bufq.desc_ring) {
166 		size = sizeof(rx->dqo.bufq.desc_ring[0]) * buffer_queue_slots;
167 		dma_free_coherent(hdev, size, rx->dqo.bufq.desc_ring,
168 				  rx->dqo.bufq.bus);
169 		rx->dqo.bufq.desc_ring = NULL;
170 	}
171 
172 	if (rx->dqo.complq.desc_ring) {
173 		size = sizeof(rx->dqo.complq.desc_ring[0]) *
174 			completion_queue_slots;
175 		dma_free_coherent(hdev, size, rx->dqo.complq.desc_ring,
176 				  rx->dqo.complq.bus);
177 		rx->dqo.complq.desc_ring = NULL;
178 	}
179 
180 	kvfree(rx->dqo.buf_states);
181 	rx->dqo.buf_states = NULL;
182 
183 	if (rx->dqo.page_pool) {
184 		page_pool_destroy(rx->dqo.page_pool);
185 		rx->dqo.page_pool = NULL;
186 	}
187 
188 	gve_rx_free_hdr_bufs(priv, rx);
189 
190 	netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx);
191 }
192 
193 static int gve_rx_alloc_hdr_bufs(struct gve_priv *priv, struct gve_rx_ring *rx,
194 				 const u32 buf_count)
195 {
196 	struct device *hdev = &priv->pdev->dev;
197 
198 	rx->dqo.hdr_bufs.data = dma_alloc_coherent(hdev, priv->header_buf_size * buf_count,
199 						   &rx->dqo.hdr_bufs.addr, GFP_KERNEL);
200 	if (!rx->dqo.hdr_bufs.data)
201 		return -ENOMEM;
202 
203 	return 0;
204 }
205 
206 void gve_rx_start_ring_dqo(struct gve_priv *priv, int idx)
207 {
208 	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);
209 
210 	gve_rx_add_to_block(priv, idx);
211 	gve_add_napi(priv, ntfy_idx, gve_napi_poll_dqo);
212 }
213 
214 int gve_rx_alloc_ring_dqo(struct gve_priv *priv,
215 			  struct gve_rx_alloc_rings_cfg *cfg,
216 			  struct gve_rx_ring *rx,
217 			  int idx)
218 {
219 	struct device *hdev = &priv->pdev->dev;
220 	struct page_pool *pool;
221 	int qpl_page_cnt;
222 	size_t size;
223 	u32 qpl_id;
224 
225 	const u32 buffer_queue_slots = cfg->ring_size;
226 	const u32 completion_queue_slots = cfg->ring_size;
227 
228 	netif_dbg(priv, drv, priv->dev, "allocating rx ring DQO\n");
229 
230 	memset(rx, 0, sizeof(*rx));
231 	rx->gve = priv;
232 	rx->q_num = idx;
233 	rx->packet_buffer_size = cfg->packet_buffer_size;
234 
235 	if (cfg->xdp) {
236 		rx->packet_buffer_truesize = GVE_XDP_RX_BUFFER_SIZE_DQO;
237 		rx->rx_headroom = XDP_PACKET_HEADROOM;
238 	} else {
239 		rx->packet_buffer_truesize = rx->packet_buffer_size;
240 		rx->rx_headroom = 0;
241 	}
242 
243 	/* struct gve_xdp_buff is overlaid on struct xdp_buff_xsk and uses the
244 	 * 24-byte cb field to store gve-specific data.
245 	 */
246 	XSK_CHECK_PRIV_TYPE(struct gve_xdp_buff);
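
	/* Layout note (an assumption inferred from usage in this file, not a
	 * definitive description of gve.h): struct gve_xdp_buff embeds a
	 * struct xdp_buff as its first member plus driver-private pointers,
	 * roughly:
	 *
	 *	struct gve_xdp_buff {
	 *		struct xdp_buff xdp;
	 *		struct gve_priv *gve;
	 *		const struct gve_rx_compl_desc_dqo *compl_desc;
	 *	};
	 *
	 * XSK_CHECK_PRIV_TYPE() is a compile-time assertion that the private
	 * part (everything past the embedded xdp_buff) fits in the 24-byte
	 * cb[] area of struct xdp_buff_xsk; two pointers fit on 64-bit.
	 */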
247 
248 	rx->dqo.num_buf_states = cfg->raw_addressing ? buffer_queue_slots :
249 		gve_get_rx_pages_per_qpl_dqo(cfg->ring_size);
250 	rx->dqo.buf_states = kvcalloc_node(rx->dqo.num_buf_states,
251 					   sizeof(rx->dqo.buf_states[0]),
252 					   GFP_KERNEL, priv->numa_node);
253 	if (!rx->dqo.buf_states)
254 		return -ENOMEM;
255 
256 	/* Allocate header buffers for header-split */
257 	if (cfg->enable_header_split)
258 		if (gve_rx_alloc_hdr_bufs(priv, rx, buffer_queue_slots))
259 			goto err;
260 
261 	/* Allocate RX completion queue */
262 	size = sizeof(rx->dqo.complq.desc_ring[0]) *
263 		completion_queue_slots;
264 	rx->dqo.complq.desc_ring =
265 		dma_alloc_coherent(hdev, size, &rx->dqo.complq.bus, GFP_KERNEL);
266 	if (!rx->dqo.complq.desc_ring)
267 		goto err;
268 
269 	/* Allocate RX buffer queue */
270 	size = sizeof(rx->dqo.bufq.desc_ring[0]) * buffer_queue_slots;
271 	rx->dqo.bufq.desc_ring =
272 		dma_alloc_coherent(hdev, size, &rx->dqo.bufq.bus, GFP_KERNEL);
273 	if (!rx->dqo.bufq.desc_ring)
274 		goto err;
275 
276 	if (cfg->raw_addressing) {
277 		pool = gve_rx_create_page_pool(priv, rx, cfg->xdp);
278 		if (IS_ERR(pool))
279 			goto err;
280 
281 		rx->dqo.page_pool = pool;
282 	} else {
283 		qpl_id = gve_get_rx_qpl_id(cfg->qcfg_tx, rx->q_num);
284 		qpl_page_cnt = gve_get_rx_pages_per_qpl_dqo(cfg->ring_size);
285 
286 		rx->dqo.qpl = gve_alloc_queue_page_list(priv, qpl_id,
287 							qpl_page_cnt);
288 		if (!rx->dqo.qpl)
289 			goto err;
290 		rx->dqo.next_qpl_page_idx = 0;
291 	}
292 
293 	rx->q_resources = dma_alloc_coherent(hdev, sizeof(*rx->q_resources),
294 					     &rx->q_resources_bus, GFP_KERNEL);
295 	if (!rx->q_resources)
296 		goto err;
297 
298 	gve_rx_init_ring_state_dqo(rx, buffer_queue_slots,
299 				   completion_queue_slots);
300 
301 	return 0;
302 
303 err:
304 	gve_rx_free_ring_dqo(priv, rx, cfg);
305 	return -ENOMEM;
306 }
307 
308 void gve_rx_write_doorbell_dqo(const struct gve_priv *priv, int queue_idx)
309 {
310 	const struct gve_rx_ring *rx = &priv->rx[queue_idx];
311 	u64 index = be32_to_cpu(rx->q_resources->db_index);
312 
313 	iowrite32(rx->dqo.bufq.tail, &priv->db_bar2[index]);
314 }
315 
316 int gve_rx_alloc_rings_dqo(struct gve_priv *priv,
317 			   struct gve_rx_alloc_rings_cfg *cfg)
318 {
319 	struct gve_rx_ring *rx;
320 	int err;
321 	int i;
322 
323 	rx = kvcalloc(cfg->qcfg_rx->max_queues, sizeof(struct gve_rx_ring),
324 		      GFP_KERNEL);
325 	if (!rx)
326 		return -ENOMEM;
327 
328 	for (i = 0; i < cfg->qcfg_rx->num_queues; i++) {
329 		err = gve_rx_alloc_ring_dqo(priv, cfg, &rx[i], i);
330 		if (err) {
331 			netif_err(priv, drv, priv->dev,
332 				  "Failed to alloc rx ring=%d: err=%d\n",
333 				  i, err);
334 			goto err;
335 		}
336 	}
337 
338 	cfg->rx = rx;
339 	return 0;
340 
341 err:
342 	for (i--; i >= 0; i--)
343 		gve_rx_free_ring_dqo(priv, &rx[i], cfg);
344 	kvfree(rx);
345 	return err;
346 }
347 
348 void gve_rx_free_rings_dqo(struct gve_priv *priv,
349 			   struct gve_rx_alloc_rings_cfg *cfg)
350 {
351 	struct gve_rx_ring *rx = cfg->rx;
352 	int i;
353 
354 	if (!rx)
355 		return;
356 
357 	for (i = 0; i < cfg->qcfg_rx->num_queues; i++)
358 		gve_rx_free_ring_dqo(priv, &rx[i], cfg);
359 
360 	kvfree(rx);
361 	cfg->rx = NULL;
362 }
363 
364 void gve_rx_post_buffers_dqo(struct gve_rx_ring *rx)
365 {
366 	struct gve_rx_compl_queue_dqo *complq = &rx->dqo.complq;
367 	struct gve_rx_buf_queue_dqo *bufq = &rx->dqo.bufq;
368 	struct gve_priv *priv = rx->gve;
369 	u32 num_avail_slots;
370 	u32 num_full_slots;
371 	u32 num_posted = 0;
372 
373 	num_full_slots = (bufq->tail - bufq->head) & bufq->mask;
374 	num_avail_slots = bufq->mask - num_full_slots;
375 
376 	num_avail_slots = min_t(u32, num_avail_slots, complq->num_free_slots);
377 	while (num_posted < num_avail_slots) {
378 		struct gve_rx_desc_dqo *desc = &bufq->desc_ring[bufq->tail];
379 
380 		if (unlikely(gve_alloc_buffer(rx, desc))) {
381 			u64_stats_update_begin(&rx->statss);
382 			rx->rx_buf_alloc_fail++;
383 			u64_stats_update_end(&rx->statss);
384 			break;
385 		}
386 
387 		if (rx->dqo.hdr_bufs.data)
388 			desc->header_buf_addr =
389 				cpu_to_le64(rx->dqo.hdr_bufs.addr +
390 					    priv->header_buf_size * bufq->tail);
391 
392 		bufq->tail = (bufq->tail + 1) & bufq->mask;
393 		complq->num_free_slots--;
394 		num_posted++;
395 
396 		if ((bufq->tail & (GVE_RX_BUF_THRESH_DQO - 1)) == 0)
397 			gve_rx_write_doorbell_dqo(priv, rx->q_num);
398 	}
399 
400 	rx->fill_cnt += num_posted;
401 }
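
/* Worked example of the ring arithmetic above (illustrative numbers only):
 * with mask = 1023 (1024 slots), head = 1020 and tail = 10,
 *
 *	num_full_slots  = (10 - 1020) & 1023 = 14
 *	num_avail_slots = 1023 - 14 = 1009
 *
 * so at most mask (not mask + 1) buffers are ever outstanding, which keeps
 * head == tail unambiguous as the empty state. The doorbell is rung roughly
 * once per GVE_RX_BUF_THRESH_DQO posted buffers (the mask-based test assumes
 * the threshold is a power of two) rather than once per buffer.
 */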
402 
403 static void gve_rx_skb_csum(struct sk_buff *skb,
404 			    const struct gve_rx_compl_desc_dqo *desc,
405 			    struct gve_ptype ptype)
406 {
407 	skb->ip_summed = CHECKSUM_NONE;
408 
409 	/* HW did not identify and process L3 and L4 headers. */
410 	if (unlikely(!desc->l3_l4_processed))
411 		return;
412 
413 	if (ptype.l3_type == GVE_L3_TYPE_IPV4) {
414 		if (unlikely(desc->csum_ip_err || desc->csum_external_ip_err))
415 			return;
416 	} else if (ptype.l3_type == GVE_L3_TYPE_IPV6) {
417 		/* Checksum should be skipped if this flag is set. */
418 		if (unlikely(desc->ipv6_ex_add))
419 			return;
420 	}
421 
422 	if (unlikely(desc->csum_l4_err))
423 		return;
424 
425 	switch (ptype.l4_type) {
426 	case GVE_L4_TYPE_TCP:
427 	case GVE_L4_TYPE_UDP:
428 	case GVE_L4_TYPE_ICMP:
429 	case GVE_L4_TYPE_SCTP:
430 		skb->ip_summed = CHECKSUM_UNNECESSARY;
431 		break;
432 	default:
433 		break;
434 	}
435 }
436 
437 static void gve_rx_skb_hash(struct sk_buff *skb,
438 			    const struct gve_rx_compl_desc_dqo *compl_desc,
439 			    struct gve_ptype ptype)
440 {
441 	enum pkt_hash_types hash_type = PKT_HASH_TYPE_L2;
442 
443 	if (ptype.l4_type != GVE_L4_TYPE_UNKNOWN)
444 		hash_type = PKT_HASH_TYPE_L4;
445 	else if (ptype.l3_type != GVE_L3_TYPE_UNKNOWN)
446 		hash_type = PKT_HASH_TYPE_L3;
447 
448 	skb_set_hash(skb, le32_to_cpu(compl_desc->hash), hash_type);
449 }
450 
451 /* Expand the hardware timestamp to the full 64 bits and add it to the skb.
452  *
453  * This algorithm works by using the passed hardware timestamp to generate a
454  * diff relative to the last read of the NIC clock. This diff can be positive
455  * or negative, as it is possible that we have read the clock more recently
456  * than the hardware received this packet. To detect this, we use the high bit
457  * of the diff and assume that the read is more recent if the high bit is set;
458  * in that case the signed diff is negative and is effectively subtracted.
459  *
460  * Note that if the time delta between packet reception and the last clock
461  * read is greater than ~2 seconds (half the 32-bit nanosecond range), the
462  * result is invalid.
463  */
464 static ktime_t gve_rx_get_hwtstamp(struct gve_priv *gve, u32 hwts)
465 {
466 	u64 last_read = READ_ONCE(gve->last_sync_nic_counter);
467 	u32 low = (u32)last_read;
468 	s32 diff = hwts - low;
469 
470 	return ns_to_ktime(last_read + diff);
471 }
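
/* Worked example for gve_rx_get_hwtstamp() (illustrative values only):
 *
 *	last_read = 0x00000002FFFFFF00 ns, so low = 0xFFFFFF00
 *	hwts      = 0x00000010
 *	diff      = (s32)(0x00000010 - 0xFFFFFF00) = +272
 *	result    = last_read + 272 = 0x0000000300000010 ns
 *
 * If the packet instead predates the last clock read, e.g. low = 0x00000100
 * and hwts = 0xFFFFFFF0, then diff = (s32)0xFFFFFEF0 = -272 and the timestamp
 * resolves to 272 ns before last_read.
 */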
472 
473 static void gve_rx_skb_hwtstamp(struct gve_rx_ring *rx,
474 				const struct gve_rx_compl_desc_dqo *desc)
475 {
476 	struct sk_buff *skb = rx->ctx.skb_head;
477 
478 	if (desc->ts_sub_nsecs_low & GVE_DQO_RX_HWTSTAMP_VALID)
479 		skb_hwtstamps(skb)->hwtstamp =
480 			gve_rx_get_hwtstamp(rx->gve, le32_to_cpu(desc->ts));
481 }
482 
483 int gve_xdp_rx_timestamp(const struct xdp_md *_ctx, u64 *timestamp)
484 {
485 	const struct gve_xdp_buff *ctx = (void *)_ctx;
486 
487 	if (!ctx->gve->nic_ts_report)
488 		return -ENODATA;
489 
490 	if (!(ctx->compl_desc->ts_sub_nsecs_low & GVE_DQO_RX_HWTSTAMP_VALID))
491 		return -ENODATA;
492 
493 	*timestamp = gve_rx_get_hwtstamp(ctx->gve,
494 					 le32_to_cpu(ctx->compl_desc->ts));
495 	return 0;
496 }
497 
498 static void gve_rx_free_skb(struct napi_struct *napi, struct gve_rx_ring *rx)
499 {
500 	if (!rx->ctx.skb_head)
501 		return;
502 
503 	if (rx->ctx.skb_head == napi->skb)
504 		napi->skb = NULL;
505 	dev_kfree_skb_any(rx->ctx.skb_head);
506 	rx->ctx.skb_head = NULL;
507 	rx->ctx.skb_tail = NULL;
508 }
509 
510 static bool gve_rx_should_trigger_copy_ondemand(struct gve_rx_ring *rx)
511 {
512 	if (!rx->dqo.qpl)
513 		return false;
514 	if (rx->dqo.used_buf_states_cnt <
515 		     (rx->dqo.num_buf_states -
516 		     GVE_DQO_QPL_ONDEMAND_ALLOC_THRESHOLD))
517 		return false;
518 	return true;
519 }
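
/* Illustrative arithmetic (hypothetical numbers, not the driver's actual
 * constants): if a QPL ring had num_buf_states = 128 and the on-demand
 * threshold constant were 96, copy-on-demand would engage once
 * used_buf_states_cnt reached 128 - 96 = 32. Raw-addressing (page pool) rings
 * never take this path because rx->dqo.qpl is NULL for them.
 */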
520 
521 static int gve_rx_copy_ondemand(struct gve_rx_ring *rx,
522 				struct gve_rx_buf_state_dqo *buf_state,
523 				u16 buf_len)
524 {
525 	struct page *page = alloc_pages_node(rx->gve->numa_node, GFP_ATOMIC, 0);
526 	int num_frags;
527 
528 	if (!page)
529 		return -ENOMEM;
530 
531 	memcpy(page_address(page),
532 	       buf_state->page_info.page_address +
533 	       buf_state->page_info.page_offset,
534 	       buf_len);
535 	num_frags = skb_shinfo(rx->ctx.skb_tail)->nr_frags;
536 	skb_add_rx_frag(rx->ctx.skb_tail, num_frags, page,
537 			0, buf_len, PAGE_SIZE);
538 
539 	u64_stats_update_begin(&rx->statss);
540 	rx->rx_frag_alloc_cnt++;
541 	u64_stats_update_end(&rx->statss);
542 	/* Return unused buffer. */
543 	gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state);
544 	return 0;
545 }
546 
547 static void gve_skb_add_rx_frag(struct gve_rx_ring *rx,
548 				struct gve_rx_buf_state_dqo *buf_state,
549 				int num_frags, u16 buf_len)
550 {
551 	if (rx->dqo.page_pool) {
552 		skb_add_rx_frag_netmem(rx->ctx.skb_tail, num_frags,
553 				       buf_state->page_info.netmem,
554 				       buf_state->page_info.page_offset +
555 				       buf_state->page_info.pad, buf_len,
556 				       buf_state->page_info.buf_size);
557 	} else {
558 		skb_add_rx_frag(rx->ctx.skb_tail, num_frags,
559 				buf_state->page_info.page,
560 				buf_state->page_info.page_offset +
561 				buf_state->page_info.pad, buf_len,
562 				buf_state->page_info.buf_size);
563 	}
564 }
565 
566 /* Chains multiple skbs for a single RX packet.
567  * Returns 0 if the buffer is appended, -1 otherwise.
568  */
569 static int gve_rx_append_frags(struct napi_struct *napi,
570 			       struct gve_rx_buf_state_dqo *buf_state,
571 			       u16 buf_len, struct gve_rx_ring *rx,
572 			       struct gve_priv *priv)
573 {
574 	int num_frags = skb_shinfo(rx->ctx.skb_tail)->nr_frags;
575 
576 	if (unlikely(num_frags == MAX_SKB_FRAGS)) {
577 		struct sk_buff *skb;
578 
579 		skb = napi_alloc_skb(napi, 0);
580 		if (!skb)
581 			return -1;
582 
583 		if (rx->dqo.page_pool)
584 			skb_mark_for_recycle(skb);
585 
586 		if (rx->ctx.skb_tail == rx->ctx.skb_head)
587 			skb_shinfo(rx->ctx.skb_head)->frag_list = skb;
588 		else
589 			rx->ctx.skb_tail->next = skb;
590 		rx->ctx.skb_tail = skb;
591 		num_frags = 0;
592 	}
593 	if (rx->ctx.skb_tail != rx->ctx.skb_head) {
594 		rx->ctx.skb_head->len += buf_len;
595 		rx->ctx.skb_head->data_len += buf_len;
596 		rx->ctx.skb_head->truesize += buf_state->page_info.buf_size;
597 	}
598 
599 	/* Trigger on-demand page allocation if we are running low on buffers */
600 	if (gve_rx_should_trigger_copy_ondemand(rx))
601 		return gve_rx_copy_ondemand(rx, buf_state, buf_len);
602 
603 	gve_skb_add_rx_frag(rx, buf_state, num_frags, buf_len);
604 	gve_reuse_buffer(rx, buf_state);
605 	return 0;
606 }
607 
608 static int gve_xdp_tx_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
609 			  struct xdp_buff *xdp)
610 {
611 	struct gve_tx_ring *tx;
612 	struct xdp_frame *xdpf;
613 	u32 tx_qid;
614 	int err;
615 
616 	xdpf = xdp_convert_buff_to_frame(xdp);
617 	if (unlikely(!xdpf)) {
618 		if (rx->xsk_pool)
619 			xsk_buff_free(xdp);
620 		return -ENOSPC;
621 	}
622 
623 	tx_qid = gve_xdp_tx_queue_id(priv, rx->q_num);
624 	tx = &priv->tx[tx_qid];
625 	spin_lock(&tx->dqo_tx.xdp_lock);
626 	err = gve_xdp_xmit_one_dqo(priv, tx, xdpf);
627 	spin_unlock(&tx->dqo_tx.xdp_lock);
628 
629 	return err;
630 }
631 
632 static void gve_xsk_done_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
633 			     struct xdp_buff *xdp, struct bpf_prog *xprog,
634 			     int xdp_act)
635 {
636 	switch (xdp_act) {
637 	case XDP_ABORTED:
638 	case XDP_DROP:
639 	default:
640 		xsk_buff_free(xdp);
641 		break;
642 	case XDP_TX:
643 		if (unlikely(gve_xdp_tx_dqo(priv, rx, xdp)))
644 			goto err;
645 		break;
646 	case XDP_REDIRECT:
647 		if (unlikely(xdp_do_redirect(priv->dev, xdp, xprog)))
648 			goto err;
649 		break;
650 	}
651 
652 	u64_stats_update_begin(&rx->statss);
653 	if ((u32)xdp_act < GVE_XDP_ACTIONS)
654 		rx->xdp_actions[xdp_act]++;
655 	u64_stats_update_end(&rx->statss);
656 	return;
657 
658 err:
659 	u64_stats_update_begin(&rx->statss);
660 	if (xdp_act == XDP_TX)
661 		rx->xdp_tx_errors++;
662 	if (xdp_act == XDP_REDIRECT)
663 		rx->xdp_redirect_errors++;
664 	u64_stats_update_end(&rx->statss);
665 }
666 
667 static void gve_xdp_done_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
668 			     struct xdp_buff *xdp, struct bpf_prog *xprog,
669 			     int xdp_act,
670 			     struct gve_rx_buf_state_dqo *buf_state)
671 {
672 	int err;
673 	switch (xdp_act) {
674 	case XDP_ABORTED:
675 	case XDP_DROP:
676 	default:
677 		gve_free_buffer(rx, buf_state);
678 		break;
679 	case XDP_TX:
680 		err = gve_xdp_tx_dqo(priv, rx, xdp);
681 		if (unlikely(err))
682 			goto err;
683 		gve_reuse_buffer(rx, buf_state);
684 		break;
685 	case XDP_REDIRECT:
686 		err = xdp_do_redirect(priv->dev, xdp, xprog);
687 		if (unlikely(err))
688 			goto err;
689 		gve_reuse_buffer(rx, buf_state);
690 		break;
691 	}
692 	u64_stats_update_begin(&rx->statss);
693 	if ((u32)xdp_act < GVE_XDP_ACTIONS)
694 		rx->xdp_actions[xdp_act]++;
695 	u64_stats_update_end(&rx->statss);
696 	return;
697 err:
698 	u64_stats_update_begin(&rx->statss);
699 	if (xdp_act == XDP_TX)
700 		rx->xdp_tx_errors++;
701 	else if (xdp_act == XDP_REDIRECT)
702 		rx->xdp_redirect_errors++;
703 	u64_stats_update_end(&rx->statss);
704 	gve_free_buffer(rx, buf_state);
705 	return;
706 }
707 
708 static int gve_rx_xsk_dqo(struct napi_struct *napi, struct gve_rx_ring *rx,
709 			  const struct gve_rx_compl_desc_dqo *compl_desc,
710 			  struct gve_rx_buf_state_dqo *buf_state,
711 			  struct bpf_prog *xprog)
712 {
713 	struct xdp_buff *xdp = buf_state->xsk_buff;
714 	int buf_len = compl_desc->packet_len;
715 	struct gve_priv *priv = rx->gve;
716 	struct gve_xdp_buff *gve_xdp;
717 	int xdp_act;
718 
719 	xdp->data_end = xdp->data + buf_len;
720 	xsk_buff_dma_sync_for_cpu(xdp);
721 
722 	gve_xdp = (void *)xdp;
723 	gve_xdp->gve = priv;
724 	gve_xdp->compl_desc = compl_desc;
725 
726 	if (xprog) {
727 		xdp_act = bpf_prog_run_xdp(xprog, xdp);
728 		buf_len = xdp->data_end - xdp->data;
729 		if (xdp_act != XDP_PASS) {
730 			gve_xsk_done_dqo(priv, rx, xdp, xprog, xdp_act);
731 			gve_free_buf_state(rx, buf_state);
732 			return 0;
733 		}
734 	}
735 
736 	/* Copy the data into an skb */
737 	rx->ctx.skb_head = gve_rx_copy_data(priv->dev, napi,
738 					    xdp->data, buf_len);
739 	if (unlikely(!rx->ctx.skb_head)) {
740 		xsk_buff_free(xdp);
741 		gve_free_buf_state(rx, buf_state);
742 		return -ENOMEM;
743 	}
744 	rx->ctx.skb_tail = rx->ctx.skb_head;
745 
746 	/* Free the XSK buffer and the buffer state */
747 	xsk_buff_free(xdp);
748 	gve_free_buf_state(rx, buf_state);
749 
750 	/* Update Stats */
751 	u64_stats_update_begin(&rx->statss);
752 	rx->xdp_actions[XDP_PASS]++;
753 	u64_stats_update_end(&rx->statss);
754 	return 0;
755 }
756 
757 static void gve_dma_sync(struct gve_priv *priv, struct gve_rx_ring *rx,
758 			 struct gve_rx_buf_state_dqo *buf_state, u16 buf_len)
759 {
760 	struct gve_rx_slot_page_info *page_info = &buf_state->page_info;
761 
762 	if (rx->dqo.page_pool) {
763 		page_pool_dma_sync_netmem_for_cpu(rx->dqo.page_pool,
764 						  page_info->netmem,
765 						  page_info->page_offset,
766 						  buf_len);
767 	} else {
768 		dma_sync_single_range_for_cpu(&priv->pdev->dev, buf_state->addr,
769 					      page_info->page_offset +
770 					      page_info->pad,
771 					      buf_len, DMA_FROM_DEVICE);
772 	}
773 }
774 
775 /* Returns 0 if the descriptor is completed successfully.
776  * Returns -EINVAL if the descriptor is invalid.
777  * Returns -ENOMEM if the data cannot be copied to an skb.
778  */
779 static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx,
780 		      const struct gve_rx_compl_desc_dqo *compl_desc,
781 		      u32 desc_idx, int queue_idx)
782 {
783 	const u16 buffer_id = le16_to_cpu(compl_desc->buf_id);
784 	const bool hbo = compl_desc->header_buffer_overflow;
785 	const bool eop = compl_desc->end_of_packet != 0;
786 	const bool hsplit = compl_desc->split_header;
787 	struct gve_rx_buf_state_dqo *buf_state;
788 	struct gve_priv *priv = rx->gve;
789 	struct bpf_prog *xprog;
790 	u16 buf_len;
791 	u16 hdr_len;
792 
793 	if (unlikely(buffer_id >= rx->dqo.num_buf_states)) {
794 		net_err_ratelimited("%s: Invalid RX buffer_id=%u\n",
795 				    priv->dev->name, buffer_id);
796 		return -EINVAL;
797 	}
798 	buf_state = &rx->dqo.buf_states[buffer_id];
799 	if (unlikely(!gve_buf_state_is_allocated(rx, buf_state))) {
800 		net_err_ratelimited("%s: RX buffer_id is not allocated: %u\n",
801 				    priv->dev->name, buffer_id);
802 		return -EINVAL;
803 	}
804 
805 	if (unlikely(compl_desc->rx_error)) {
806 		gve_free_buffer(rx, buf_state);
807 		return -EINVAL;
808 	}
809 
810 	buf_len = compl_desc->packet_len;
811 	hdr_len = compl_desc->header_len;
812 
813 	xprog = READ_ONCE(priv->xdp_prog);
814 	if (buf_state->xsk_buff)
815 		return gve_rx_xsk_dqo(napi, rx, compl_desc, buf_state, xprog);
816 
817 	/* The page might not have been used for a while and was likely last
818 	 * written by a different thread.
819 	 */
820 	if (rx->dqo.page_pool) {
821 		if (!netmem_is_net_iov(buf_state->page_info.netmem))
822 			prefetch(netmem_to_page(buf_state->page_info.netmem));
823 	} else {
824 		prefetch(buf_state->page_info.page);
825 	}
826 
827 	/* Copy the header into the skb in the case of header split */
828 	if (hsplit) {
829 		int unsplit = 0;
830 
831 		if (hdr_len && !hbo) {
832 			rx->ctx.skb_head = gve_rx_copy_data(priv->dev, napi,
833 							    rx->dqo.hdr_bufs.data +
834 							    desc_idx * priv->header_buf_size,
835 							    hdr_len);
836 			if (unlikely(!rx->ctx.skb_head))
837 				goto error;
838 			rx->ctx.skb_tail = rx->ctx.skb_head;
839 
840 			if (rx->dqo.page_pool)
841 				skb_mark_for_recycle(rx->ctx.skb_head);
842 		} else {
843 			unsplit = 1;
844 		}
845 		u64_stats_update_begin(&rx->statss);
846 		rx->rx_hsplit_pkt++;
847 		rx->rx_hsplit_unsplit_pkt += unsplit;
848 		rx->rx_hsplit_bytes += hdr_len;
849 		u64_stats_update_end(&rx->statss);
850 	} else if (!rx->ctx.skb_head && rx->dqo.page_pool &&
851 		   netmem_is_net_iov(buf_state->page_info.netmem)) {
852 		/* When header split is disabled, the header lands in the packet
853 		 * buffer. If the packet buffer is a net_iov, it cannot be
854 		 * mapped into kernel address space to access the header
855 		 * required to process the packet.
856 		 */
857 		goto error;
858 	}
859 
860 	/* Sync the portion of the DMA buffer that the CPU will read. */
861 	gve_dma_sync(priv, rx, buf_state, buf_len);
862 
863 	/* Append to current skb if one exists. */
864 	if (rx->ctx.skb_head) {
865 		if (unlikely(gve_rx_append_frags(napi, buf_state, buf_len, rx,
866 						 priv) != 0)) {
867 			goto error;
868 		}
869 		return 0;
870 	}
871 
872 	if (xprog) {
873 		struct gve_xdp_buff gve_xdp;
874 		void *old_data;
875 		int xdp_act;
876 
877 		xdp_init_buff(&gve_xdp.xdp, buf_state->page_info.buf_size,
878 			      &rx->xdp_rxq);
879 		xdp_prepare_buff(&gve_xdp.xdp,
880 				 buf_state->page_info.page_address +
881 				 buf_state->page_info.page_offset,
882 				 buf_state->page_info.pad,
883 				 buf_len, false);
884 		gve_xdp.gve = priv;
885 		gve_xdp.compl_desc = compl_desc;
886 
887 		old_data = gve_xdp.xdp.data;
888 		xdp_act = bpf_prog_run_xdp(xprog, &gve_xdp.xdp);
889 		buf_state->page_info.pad += gve_xdp.xdp.data - old_data;
890 		buf_len = gve_xdp.xdp.data_end - gve_xdp.xdp.data;
891 		if (xdp_act != XDP_PASS) {
892 			gve_xdp_done_dqo(priv, rx, &gve_xdp.xdp, xprog, xdp_act,
893 					 buf_state);
894 			return 0;
895 		}
896 
897 		u64_stats_update_begin(&rx->statss);
898 		rx->xdp_actions[XDP_PASS]++;
899 		u64_stats_update_end(&rx->statss);
900 	}
901 
902 	if (eop && buf_len <= priv->rx_copybreak &&
903 	    !(rx->dqo.page_pool &&
904 	      netmem_is_net_iov(buf_state->page_info.netmem))) {
905 		rx->ctx.skb_head = gve_rx_copy(priv->dev, napi,
906 					       &buf_state->page_info, buf_len);
907 		if (unlikely(!rx->ctx.skb_head))
908 			goto error;
909 		rx->ctx.skb_tail = rx->ctx.skb_head;
910 
911 		u64_stats_update_begin(&rx->statss);
912 		rx->rx_copied_pkt++;
913 		rx->rx_copybreak_pkt++;
914 		u64_stats_update_end(&rx->statss);
915 
916 		gve_free_buffer(rx, buf_state);
917 		return 0;
918 	}
919 
920 	rx->ctx.skb_head = napi_get_frags(napi);
921 	if (unlikely(!rx->ctx.skb_head))
922 		goto error;
923 	rx->ctx.skb_tail = rx->ctx.skb_head;
924 
925 	if (gve_rx_should_trigger_copy_ondemand(rx)) {
926 		if (gve_rx_copy_ondemand(rx, buf_state, buf_len) < 0)
927 			goto error;
928 		return 0;
929 	}
930 
931 	if (rx->dqo.page_pool)
932 		skb_mark_for_recycle(rx->ctx.skb_head);
933 
934 	gve_skb_add_rx_frag(rx, buf_state, 0, buf_len);
935 	gve_reuse_buffer(rx, buf_state);
936 	return 0;
937 
938 error:
939 	gve_free_buffer(rx, buf_state);
940 	return -ENOMEM;
941 }
942 
943 static int gve_rx_complete_rsc(struct sk_buff *skb,
944 			       const struct gve_rx_compl_desc_dqo *desc,
945 			       struct gve_ptype ptype)
946 {
947 	struct skb_shared_info *shinfo = skb_shinfo(skb);
948 
949 	/* Only TCP is supported right now. */
950 	if (ptype.l4_type != GVE_L4_TYPE_TCP)
951 		return -EINVAL;
952 
953 	switch (ptype.l3_type) {
954 	case GVE_L3_TYPE_IPV4:
955 		shinfo->gso_type = SKB_GSO_TCPV4;
956 		break;
957 	case GVE_L3_TYPE_IPV6:
958 		shinfo->gso_type = SKB_GSO_TCPV6;
959 		break;
960 	default:
961 		return -EINVAL;
962 	}
963 
964 	shinfo->gso_size = le16_to_cpu(desc->rsc_seg_len);
965 	return 0;
966 }
967 
968 /* Returns 0 if skb is completed successfully, -1 otherwise. */
969 static int gve_rx_complete_skb(struct gve_rx_ring *rx, struct napi_struct *napi,
970 			       const struct gve_rx_compl_desc_dqo *desc,
971 			       netdev_features_t feat)
972 {
973 	struct gve_ptype ptype =
974 		rx->gve->ptype_lut_dqo->ptypes[desc->packet_type];
975 	int err;
976 
977 	skb_record_rx_queue(rx->ctx.skb_head, rx->q_num);
978 
979 	if (feat & NETIF_F_RXHASH)
980 		gve_rx_skb_hash(rx->ctx.skb_head, desc, ptype);
981 
982 	if (feat & NETIF_F_RXCSUM)
983 		gve_rx_skb_csum(rx->ctx.skb_head, desc, ptype);
984 
985 	if (rx->gve->ts_config.rx_filter == HWTSTAMP_FILTER_ALL)
986 		gve_rx_skb_hwtstamp(rx, desc);
987 
988 	/* RSC packets must set gso_size, otherwise the TCP stack will complain
989 	 * that packets are larger than the MTU.
990 	 */
991 	if (desc->rsc) {
992 		err = gve_rx_complete_rsc(rx->ctx.skb_head, desc, ptype);
993 		if (err < 0)
994 			return err;
995 	}
996 
997 	if (skb_headlen(rx->ctx.skb_head) == 0)
998 		napi_gro_frags(napi);
999 	else
1000 		napi_gro_receive(napi, rx->ctx.skb_head);
1001 
1002 	return 0;
1003 }
1004 
1005 int gve_rx_poll_dqo(struct gve_notify_block *block, int budget)
1006 {
1007 	struct gve_rx_compl_queue_dqo *complq;
1008 	struct napi_struct *napi;
1009 	netdev_features_t feat;
1010 	struct gve_rx_ring *rx;
1011 	struct gve_priv *priv;
1012 	u64 xdp_redirects;
1013 	u32 work_done = 0;
1014 	u64 bytes = 0;
1015 	u64 xdp_txs;
1016 	int err;
1017 
1018 	napi = &block->napi;
1019 	feat = napi->dev->features;
1020 
1021 	rx = block->rx;
1022 	priv = rx->gve;
1023 	complq = &rx->dqo.complq;
1024 
1025 	xdp_redirects = rx->xdp_actions[XDP_REDIRECT];
1026 	xdp_txs = rx->xdp_actions[XDP_TX];
1027 
1028 	while (work_done < budget) {
1029 		struct gve_rx_compl_desc_dqo *compl_desc =
1030 			&complq->desc_ring[complq->head];
1031 		u32 pkt_bytes;
1032 
1033 		/* No more new packets */
1034 		if (compl_desc->generation == complq->cur_gen_bit)
1035 			break;
1036 
1037 		/* Prefetch the next two descriptors. */
1038 		prefetch(&complq->desc_ring[(complq->head + 1) & complq->mask]);
1039 		prefetch(&complq->desc_ring[(complq->head + 2) & complq->mask]);
1040 
1041 		/* Do not read data until we own the descriptor */
1042 		dma_rmb();
1043 
1044 		err = gve_rx_dqo(napi, rx, compl_desc, complq->head, rx->q_num);
1045 		if (err < 0) {
1046 			gve_rx_free_skb(napi, rx);
1047 			u64_stats_update_begin(&rx->statss);
1048 			if (err == -ENOMEM)
1049 				rx->rx_skb_alloc_fail++;
1050 			else if (err == -EINVAL)
1051 				rx->rx_desc_err_dropped_pkt++;
1052 			u64_stats_update_end(&rx->statss);
1053 		}
1054 
1055 		complq->head = (complq->head + 1) & complq->mask;
1056 		complq->num_free_slots++;
1057 
1058 		/* When the ring wraps, the generation bit is flipped. */
1059 		complq->cur_gen_bit ^= (complq->head == 0);
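		/* Illustrative walk-through of the flip above (the device-side
		 * behaviour is inferred from this code, not documented here):
		 * the completion ring starts zeroed with cur_gen_bit = 0, so
		 * unwritten slots read as "not new". The device writes its
		 * first pass of descriptors with generation = 1, which the
		 * check at the top of this loop accepts. Once head wraps back
		 * to 0, cur_gen_bit becomes 1, so those same slots read as
		 * stale until the device overwrites them with generation = 0,
		 * alternating on every pass.
		 */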
1060 
1061 		/* Receiving a completion means we have space to post another
1062 		 * buffer on the buffer queue.
1063 		 */
1064 		{
1065 			struct gve_rx_buf_queue_dqo *bufq = &rx->dqo.bufq;
1066 
1067 			bufq->head = (bufq->head + 1) & bufq->mask;
1068 		}
1069 
1070 		/* Free-running counter of completed descriptors */
1071 		rx->cnt++;
1072 
1073 		if (!rx->ctx.skb_head)
1074 			continue;
1075 
1076 		if (!compl_desc->end_of_packet)
1077 			continue;
1078 
1079 		work_done++;
1080 		pkt_bytes = rx->ctx.skb_head->len;
1081 		/* The Ethernet header (first ETH_HLEN bytes) is stripped off
1082 		 * by eth_type_trans().
1083 		 */
1084 		if (skb_headlen(rx->ctx.skb_head))
1085 			pkt_bytes += ETH_HLEN;
1086 
1087 		/* gve_rx_complete_skb() will consume skb if successful */
1088 		if (gve_rx_complete_skb(rx, napi, compl_desc, feat) != 0) {
1089 			gve_rx_free_skb(napi, rx);
1090 			u64_stats_update_begin(&rx->statss);
1091 			rx->rx_desc_err_dropped_pkt++;
1092 			u64_stats_update_end(&rx->statss);
1093 			continue;
1094 		}
1095 
1096 		bytes += pkt_bytes;
1097 		rx->ctx.skb_head = NULL;
1098 		rx->ctx.skb_tail = NULL;
1099 	}
1100 
1101 	if (xdp_txs != rx->xdp_actions[XDP_TX])
1102 		gve_xdp_tx_flush_dqo(priv, rx->q_num);
1103 
1104 	if (xdp_redirects != rx->xdp_actions[XDP_REDIRECT])
1105 		xdp_do_flush();
1106 
1107 	gve_rx_post_buffers_dqo(rx);
1108 
1109 	u64_stats_update_begin(&rx->statss);
1110 	rx->rpackets += work_done;
1111 	rx->rbytes += bytes;
1112 	u64_stats_update_end(&rx->statss);
1113 
1114 	return work_done;
1115 }
1116