// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2021 Google, Inc.
 */

#include "gve.h"
#include "gve_dqo.h"
#include "gve_adminq.h"
#include "gve_utils.h"
#include <linux/bpf.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <net/ip6_checksum.h>
#include <net/ipv6.h>
#include <net/tcp.h>

static void gve_rx_free_hdr_bufs(struct gve_priv *priv, struct gve_rx_ring *rx)
{
	struct device *hdev = &priv->pdev->dev;
	int buf_count = rx->dqo.bufq.mask + 1;

	if (rx->dqo.hdr_bufs.data) {
		dma_free_coherent(hdev, priv->header_buf_size * buf_count,
				  rx->dqo.hdr_bufs.data, rx->dqo.hdr_bufs.addr);
		rx->dqo.hdr_bufs.data = NULL;
	}
}

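/* Reset the software view of an RX ring's queues.
 *
 * The buffer and completion queue indices and the completion generation bit
 * are cleared, any in-progress skb context is dropped, and the buffer-state
 * array is threaded into a singly linked free list terminated by -1. For
 * example, with num_buf_states == 4 the free list runs 0 -> 1 -> 2 -> 3 and
 * ends at -1, while the recycled and used lists start out empty.
 */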
static void gve_rx_init_ring_state_dqo(struct gve_rx_ring *rx,
				       const u32 buffer_queue_slots,
				       const u32 completion_queue_slots)
{
	int i;

	/* Set buffer queue state */
	rx->dqo.bufq.mask = buffer_queue_slots - 1;
	rx->dqo.bufq.head = 0;
	rx->dqo.bufq.tail = 0;

	/* Set completion queue state */
	rx->dqo.complq.num_free_slots = completion_queue_slots;
	rx->dqo.complq.mask = completion_queue_slots - 1;
	rx->dqo.complq.cur_gen_bit = 0;
	rx->dqo.complq.head = 0;

	/* Set RX SKB context */
	rx->ctx.skb_head = NULL;
	rx->ctx.skb_tail = NULL;

	/* Set up linked list of buffer IDs */
	if (rx->dqo.buf_states) {
		for (i = 0; i < rx->dqo.num_buf_states - 1; i++)
			rx->dqo.buf_states[i].next = i + 1;
		rx->dqo.buf_states[rx->dqo.num_buf_states - 1].next = -1;
	}

	rx->dqo.free_buf_states = 0;
	rx->dqo.recycled_buf_states.head = -1;
	rx->dqo.recycled_buf_states.tail = -1;
	rx->dqo.used_buf_states.head = -1;
	rx->dqo.used_buf_states.tail = -1;
}

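/* Return a ring to its freshly initialized state without releasing DMA
 * allocations: both descriptor rings and q_resources are zeroed, every
 * buffer state gives its page back to the page pool or QPL, and the queue
 * indices are rebuilt by gve_rx_init_ring_state_dqo().
 */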
static void gve_rx_reset_ring_dqo(struct gve_priv *priv, int idx)
{
	struct gve_rx_ring *rx = &priv->rx[idx];
	size_t size;
	int i;

	const u32 buffer_queue_slots = priv->rx_desc_cnt;
	const u32 completion_queue_slots = priv->rx_desc_cnt;

	/* Reset buffer queue */
	if (rx->dqo.bufq.desc_ring) {
		size = sizeof(rx->dqo.bufq.desc_ring[0]) *
			buffer_queue_slots;
		memset(rx->dqo.bufq.desc_ring, 0, size);
	}

	/* Reset completion queue */
	if (rx->dqo.complq.desc_ring) {
		size = sizeof(rx->dqo.complq.desc_ring[0]) *
			completion_queue_slots;
		memset(rx->dqo.complq.desc_ring, 0, size);
	}

	/* Reset q_resources */
	if (rx->q_resources)
		memset(rx->q_resources, 0, sizeof(*rx->q_resources));

	/* Reset buf states */
	if (rx->dqo.buf_states) {
		for (i = 0; i < rx->dqo.num_buf_states; i++) {
			struct gve_rx_buf_state_dqo *bs = &rx->dqo.buf_states[i];

			if (rx->dqo.page_pool)
				gve_free_to_page_pool(rx, bs, false);
			else
				gve_free_qpl_page_dqo(bs);
		}
	}

	gve_rx_init_ring_state_dqo(rx, buffer_queue_slots,
				   completion_queue_slots);
}

void gve_rx_stop_ring_dqo(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);
	struct gve_rx_ring *rx = &priv->rx[idx];

	if (!gve_rx_was_added_to_block(priv, idx))
		return;

	if (rx->dqo.page_pool)
		page_pool_disable_direct_recycling(rx->dqo.page_pool);
	gve_remove_napi(priv, ntfy_idx);
	gve_rx_remove_from_block(priv, idx);
	gve_rx_reset_ring_dqo(priv, idx);
}

void gve_rx_free_ring_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
			  struct gve_rx_alloc_rings_cfg *cfg)
{
	struct device *hdev = &priv->pdev->dev;
	size_t completion_queue_slots;
	size_t buffer_queue_slots;
	int idx = rx->q_num;
	size_t size;
	u32 qpl_id;
	int i;

	completion_queue_slots = rx->dqo.complq.mask + 1;
	buffer_queue_slots = rx->dqo.bufq.mask + 1;

	if (rx->q_resources) {
		dma_free_coherent(hdev, sizeof(*rx->q_resources),
				  rx->q_resources, rx->q_resources_bus);
		rx->q_resources = NULL;
	}

	for (i = 0; i < rx->dqo.num_buf_states; i++) {
		struct gve_rx_buf_state_dqo *bs = &rx->dqo.buf_states[i];

		if (rx->dqo.page_pool)
			gve_free_to_page_pool(rx, bs, false);
		else
			gve_free_qpl_page_dqo(bs);
	}

	if (rx->dqo.qpl) {
		qpl_id = gve_get_rx_qpl_id(cfg->qcfg_tx, rx->q_num);
		gve_free_queue_page_list(priv, rx->dqo.qpl, qpl_id);
		rx->dqo.qpl = NULL;
	}

	if (rx->dqo.bufq.desc_ring) {
		size = sizeof(rx->dqo.bufq.desc_ring[0]) * buffer_queue_slots;
		dma_free_coherent(hdev, size, rx->dqo.bufq.desc_ring,
				  rx->dqo.bufq.bus);
		rx->dqo.bufq.desc_ring = NULL;
	}

	if (rx->dqo.complq.desc_ring) {
		size = sizeof(rx->dqo.complq.desc_ring[0]) *
			completion_queue_slots;
		dma_free_coherent(hdev, size, rx->dqo.complq.desc_ring,
				  rx->dqo.complq.bus);
		rx->dqo.complq.desc_ring = NULL;
	}

	kvfree(rx->dqo.buf_states);
	rx->dqo.buf_states = NULL;

	if (rx->dqo.page_pool) {
		page_pool_destroy(rx->dqo.page_pool);
		rx->dqo.page_pool = NULL;
	}

	gve_rx_free_hdr_bufs(priv, rx);

	netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx);
}

static int gve_rx_alloc_hdr_bufs(struct gve_priv *priv, struct gve_rx_ring *rx,
				 const u32 buf_count)
{
	struct device *hdev = &priv->pdev->dev;

	rx->dqo.hdr_bufs.data = dma_alloc_coherent(hdev, priv->header_buf_size * buf_count,
						   &rx->dqo.hdr_bufs.addr, GFP_KERNEL);
	if (!rx->dqo.hdr_bufs.data)
		return -ENOMEM;

	return 0;
}

void gve_rx_start_ring_dqo(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);

	gve_rx_add_to_block(priv, idx);
	gve_add_napi(priv, ntfy_idx, gve_napi_poll_dqo);
}

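/* Allocate everything one DQO RX ring needs: the buffer-state array,
 * optional header-split buffers, the completion and buffer descriptor rings,
 * and either a page pool (raw addressing) or a queue page list, followed by
 * q_resources. Every failure path jumps to err, which is safe because
 * gve_rx_free_ring_dqo() skips members that were never allocated.
 */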
int gve_rx_alloc_ring_dqo(struct gve_priv *priv,
			  struct gve_rx_alloc_rings_cfg *cfg,
			  struct gve_rx_ring *rx,
			  int idx)
{
	struct device *hdev = &priv->pdev->dev;
	struct page_pool *pool;
	int qpl_page_cnt;
	size_t size;
	u32 qpl_id;

	const u32 buffer_queue_slots = cfg->ring_size;
	const u32 completion_queue_slots = cfg->ring_size;

	netif_dbg(priv, drv, priv->dev, "allocating rx ring DQO\n");

	memset(rx, 0, sizeof(*rx));
	rx->gve = priv;
	rx->q_num = idx;
	rx->packet_buffer_size = cfg->packet_buffer_size;

	if (cfg->xdp) {
		rx->packet_buffer_truesize = GVE_XDP_RX_BUFFER_SIZE_DQO;
		rx->rx_headroom = XDP_PACKET_HEADROOM;
	} else {
		rx->packet_buffer_truesize = rx->packet_buffer_size;
		rx->rx_headroom = 0;
	}

	rx->dqo.num_buf_states = cfg->raw_addressing ? buffer_queue_slots :
		gve_get_rx_pages_per_qpl_dqo(cfg->ring_size);
	rx->dqo.buf_states = kvcalloc(rx->dqo.num_buf_states,
				      sizeof(rx->dqo.buf_states[0]),
				      GFP_KERNEL);
	if (!rx->dqo.buf_states)
		return -ENOMEM;

	/* Allocate header buffers for header-split */
	if (cfg->enable_header_split)
		if (gve_rx_alloc_hdr_bufs(priv, rx, buffer_queue_slots))
			goto err;

	/* Allocate RX completion queue */
	size = sizeof(rx->dqo.complq.desc_ring[0]) *
		completion_queue_slots;
	rx->dqo.complq.desc_ring =
		dma_alloc_coherent(hdev, size, &rx->dqo.complq.bus, GFP_KERNEL);
	if (!rx->dqo.complq.desc_ring)
		goto err;

	/* Allocate RX buffer queue */
	size = sizeof(rx->dqo.bufq.desc_ring[0]) * buffer_queue_slots;
	rx->dqo.bufq.desc_ring =
		dma_alloc_coherent(hdev, size, &rx->dqo.bufq.bus, GFP_KERNEL);
	if (!rx->dqo.bufq.desc_ring)
		goto err;

	if (cfg->raw_addressing) {
		pool = gve_rx_create_page_pool(priv, rx, cfg->xdp);
		if (IS_ERR(pool))
			goto err;

		rx->dqo.page_pool = pool;
	} else {
		qpl_id = gve_get_rx_qpl_id(cfg->qcfg_tx, rx->q_num);
		qpl_page_cnt = gve_get_rx_pages_per_qpl_dqo(cfg->ring_size);

		rx->dqo.qpl = gve_alloc_queue_page_list(priv, qpl_id,
							qpl_page_cnt);
		if (!rx->dqo.qpl)
			goto err;
		rx->dqo.next_qpl_page_idx = 0;
	}

	rx->q_resources = dma_alloc_coherent(hdev, sizeof(*rx->q_resources),
					     &rx->q_resources_bus, GFP_KERNEL);
	if (!rx->q_resources)
		goto err;

	gve_rx_init_ring_state_dqo(rx, buffer_queue_slots,
				   completion_queue_slots);

	return 0;

err:
	gve_rx_free_ring_dqo(priv, rx, cfg);
	return -ENOMEM;
}

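/* Ring the buffer queue doorbell for a queue. The doorbell slot within BAR2
 * comes from the device-provided, big-endian db_index in q_resources, and
 * the value written is the current buffer queue tail.
 */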
void gve_rx_write_doorbell_dqo(const struct gve_priv *priv, int queue_idx)
{
	const struct gve_rx_ring *rx = &priv->rx[queue_idx];
	u64 index = be32_to_cpu(rx->q_resources->db_index);

	iowrite32(rx->dqo.bufq.tail, &priv->db_bar2[index]);
}

int gve_rx_alloc_rings_dqo(struct gve_priv *priv,
			   struct gve_rx_alloc_rings_cfg *cfg)
{
	struct gve_rx_ring *rx;
	int err;
	int i;

	rx = kvcalloc(cfg->qcfg_rx->max_queues, sizeof(struct gve_rx_ring),
		      GFP_KERNEL);
	if (!rx)
		return -ENOMEM;

	for (i = 0; i < cfg->qcfg_rx->num_queues; i++) {
		err = gve_rx_alloc_ring_dqo(priv, cfg, &rx[i], i);
		if (err) {
			netif_err(priv, drv, priv->dev,
				  "Failed to alloc rx ring=%d: err=%d\n",
				  i, err);
			goto err;
		}
	}

	cfg->rx = rx;
	return 0;

err:
	for (i--; i >= 0; i--)
		gve_rx_free_ring_dqo(priv, &rx[i], cfg);
	kvfree(rx);
	return err;
}

void gve_rx_free_rings_dqo(struct gve_priv *priv,
			   struct gve_rx_alloc_rings_cfg *cfg)
{
	struct gve_rx_ring *rx = cfg->rx;
	int i;

	if (!rx)
		return;

	for (i = 0; i < cfg->qcfg_rx->num_queues; i++)
		gve_rx_free_ring_dqo(priv, &rx[i], cfg);

	kvfree(rx);
	cfg->rx = NULL;
}

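/* Refill the buffer queue with as many buffers as both queues can accept.
 * Occupancy is computed with masked indices, so at most mask slots (one less
 * than the ring size) are ever in flight and head == tail unambiguously
 * means empty; for example, with 1024 slots at most 1023 buffers are posted.
 * The doorbell is rung once every GVE_RX_BUF_THRESH_DQO buffers instead of
 * once per descriptor.
 */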
void gve_rx_post_buffers_dqo(struct gve_rx_ring *rx)
{
	struct gve_rx_compl_queue_dqo *complq = &rx->dqo.complq;
	struct gve_rx_buf_queue_dqo *bufq = &rx->dqo.bufq;
	struct gve_priv *priv = rx->gve;
	u32 num_avail_slots;
	u32 num_full_slots;
	u32 num_posted = 0;

	num_full_slots = (bufq->tail - bufq->head) & bufq->mask;
	num_avail_slots = bufq->mask - num_full_slots;

	num_avail_slots = min_t(u32, num_avail_slots, complq->num_free_slots);
	while (num_posted < num_avail_slots) {
		struct gve_rx_desc_dqo *desc = &bufq->desc_ring[bufq->tail];

		if (unlikely(gve_alloc_buffer(rx, desc))) {
			u64_stats_update_begin(&rx->statss);
			rx->rx_buf_alloc_fail++;
			u64_stats_update_end(&rx->statss);
			break;
		}

		if (rx->dqo.hdr_bufs.data)
			desc->header_buf_addr =
				cpu_to_le64(rx->dqo.hdr_bufs.addr +
					    priv->header_buf_size * bufq->tail);

		bufq->tail = (bufq->tail + 1) & bufq->mask;
		complq->num_free_slots--;
		num_posted++;

		if ((bufq->tail & (GVE_RX_BUF_THRESH_DQO - 1)) == 0)
			gve_rx_write_doorbell_dqo(priv, rx->q_num);
	}

	rx->fill_cnt += num_posted;
}

static void gve_rx_skb_csum(struct sk_buff *skb,
			    const struct gve_rx_compl_desc_dqo *desc,
			    struct gve_ptype ptype)
{
	skb->ip_summed = CHECKSUM_NONE;

	/* HW did not identify and process L3 and L4 headers. */
	if (unlikely(!desc->l3_l4_processed))
		return;

	if (ptype.l3_type == GVE_L3_TYPE_IPV4) {
		if (unlikely(desc->csum_ip_err || desc->csum_external_ip_err))
			return;
	} else if (ptype.l3_type == GVE_L3_TYPE_IPV6) {
		/* Checksum should be skipped if this flag is set. */
		if (unlikely(desc->ipv6_ex_add))
			return;
	}

	if (unlikely(desc->csum_l4_err))
		return;

	switch (ptype.l4_type) {
	case GVE_L4_TYPE_TCP:
	case GVE_L4_TYPE_UDP:
	case GVE_L4_TYPE_ICMP:
	case GVE_L4_TYPE_SCTP:
		skb->ip_summed = CHECKSUM_UNNECESSARY;
		break;
	default:
		break;
	}
}

static void gve_rx_skb_hash(struct sk_buff *skb,
			    const struct gve_rx_compl_desc_dqo *compl_desc,
			    struct gve_ptype ptype)
{
	enum pkt_hash_types hash_type = PKT_HASH_TYPE_L2;

	if (ptype.l4_type != GVE_L4_TYPE_UNKNOWN)
		hash_type = PKT_HASH_TYPE_L4;
	else if (ptype.l3_type != GVE_L3_TYPE_UNKNOWN)
		hash_type = PKT_HASH_TYPE_L3;

	skb_set_hash(skb, le32_to_cpu(compl_desc->hash), hash_type);
}

/* Expand the hardware timestamp to the full 64 bits of width, and add it to the
 * skb.
 *
 * This algorithm works by using the passed hardware timestamp to generate a
 * diff relative to the last read of the nic clock. This diff can be positive or
 * negative, as it is possible that we have read the clock more recently than
 * the hardware has received this packet. To detect this, we use the high bit of
 * the diff, and assume that the read is more recent if the high bit is set. In
 * this case we invert the process.
 *
 * Note that this means if the time delta between packet reception and the last
 * clock read is greater than ~2 seconds, this will provide invalid results.
 */
static void gve_rx_skb_hwtstamp(struct gve_rx_ring *rx, u32 hwts)
{
	u64 last_read = READ_ONCE(rx->gve->last_sync_nic_counter);
	struct sk_buff *skb = rx->ctx.skb_head;
	u32 low = (u32)last_read;
	s32 diff = hwts - low;

	skb_hwtstamps(skb)->hwtstamp = ns_to_ktime(last_read + diff);
}
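
/* Worked example with illustrative values: if last_sync_nic_counter is
 * 0x1F0000000 ns, then low is 0xF0000000. A hardware timestamp of 0x10000000
 * gives diff = 0x20000000 (positive as an s32), so the expanded stamp is
 * 0x210000000 ns, a little after the last clock read; a timestamp of
 * 0xE0000000 gives diff = -0x10000000 and a stamp a little before it.
 */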

static void gve_rx_free_skb(struct napi_struct *napi, struct gve_rx_ring *rx)
{
	if (!rx->ctx.skb_head)
		return;

	if (rx->ctx.skb_head == napi->skb)
		napi->skb = NULL;
	dev_kfree_skb_any(rx->ctx.skb_head);
	rx->ctx.skb_head = NULL;
	rx->ctx.skb_tail = NULL;
}

static bool gve_rx_should_trigger_copy_ondemand(struct gve_rx_ring *rx)
{
	if (!rx->dqo.qpl)
		return false;
	if (rx->dqo.used_buf_states_cnt <
		     (rx->dqo.num_buf_states -
		     GVE_DQO_QPL_ONDEMAND_ALLOC_THRESHOLD))
		return false;
	return true;
}

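/* QPL-mode fallback used when buffer states run low: copy the received data
 * into a freshly allocated page, attach that page to the tail skb, and put
 * the original buffer state straight onto the recycled list so its QPL page
 * can be reposted without waiting for the skb to be freed.
 */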
static int gve_rx_copy_ondemand(struct gve_rx_ring *rx,
				struct gve_rx_buf_state_dqo *buf_state,
				u16 buf_len)
{
	struct page *page = alloc_page(GFP_ATOMIC);
	int num_frags;

	if (!page)
		return -ENOMEM;

	memcpy(page_address(page),
	       buf_state->page_info.page_address +
	       buf_state->page_info.page_offset,
	       buf_len);
	num_frags = skb_shinfo(rx->ctx.skb_tail)->nr_frags;
	skb_add_rx_frag(rx->ctx.skb_tail, num_frags, page,
			0, buf_len, PAGE_SIZE);

	u64_stats_update_begin(&rx->statss);
	rx->rx_frag_alloc_cnt++;
	u64_stats_update_end(&rx->statss);
	/* Return unused buffer. */
	gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state);
	return 0;
}

static void gve_skb_add_rx_frag(struct gve_rx_ring *rx,
				struct gve_rx_buf_state_dqo *buf_state,
				int num_frags, u16 buf_len)
{
	if (rx->dqo.page_pool) {
		skb_add_rx_frag_netmem(rx->ctx.skb_tail, num_frags,
				       buf_state->page_info.netmem,
				       buf_state->page_info.page_offset +
				       buf_state->page_info.pad, buf_len,
				       buf_state->page_info.buf_size);
	} else {
		skb_add_rx_frag(rx->ctx.skb_tail, num_frags,
				buf_state->page_info.page,
				buf_state->page_info.page_offset +
				buf_state->page_info.pad, buf_len,
				buf_state->page_info.buf_size);
	}
}

/* Chains multiple skbs for a single rx packet.
 * Returns 0 if the buffer is appended, -1 otherwise.
 */
static int gve_rx_append_frags(struct napi_struct *napi,
			       struct gve_rx_buf_state_dqo *buf_state,
			       u16 buf_len, struct gve_rx_ring *rx,
			       struct gve_priv *priv)
{
	int num_frags = skb_shinfo(rx->ctx.skb_tail)->nr_frags;

	if (unlikely(num_frags == MAX_SKB_FRAGS)) {
		struct sk_buff *skb;

		skb = napi_alloc_skb(napi, 0);
		if (!skb)
			return -1;

		if (rx->dqo.page_pool)
			skb_mark_for_recycle(skb);

		if (rx->ctx.skb_tail == rx->ctx.skb_head)
			skb_shinfo(rx->ctx.skb_head)->frag_list = skb;
		else
			rx->ctx.skb_tail->next = skb;
		rx->ctx.skb_tail = skb;
		num_frags = 0;
	}
	if (rx->ctx.skb_tail != rx->ctx.skb_head) {
		rx->ctx.skb_head->len += buf_len;
		rx->ctx.skb_head->data_len += buf_len;
		rx->ctx.skb_head->truesize += buf_state->page_info.buf_size;
	}

	/* Trigger ondemand page allocation if we are running low on buffers */
	if (gve_rx_should_trigger_copy_ondemand(rx))
		return gve_rx_copy_ondemand(rx, buf_state, buf_len);

	gve_skb_add_rx_frag(rx, buf_state, num_frags, buf_len);
	gve_reuse_buffer(rx, buf_state);
	return 0;
}

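/* XDP_TX helper: convert the xdp_buff into an xdp_frame and send it on the
 * XDP TX queue associated with this RX queue, serialized by that queue's
 * xdp_lock.
 */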
static int gve_xdp_tx_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
			  struct xdp_buff *xdp)
{
	struct gve_tx_ring *tx;
	struct xdp_frame *xdpf;
	u32 tx_qid;
	int err;

	xdpf = xdp_convert_buff_to_frame(xdp);
	if (unlikely(!xdpf))
		return -ENOSPC;

	tx_qid = gve_xdp_tx_queue_id(priv, rx->q_num);
	tx = &priv->tx[tx_qid];
	spin_lock(&tx->dqo_tx.xdp_lock);
	err = gve_xdp_xmit_one_dqo(priv, tx, xdpf);
	spin_unlock(&tx->dqo_tx.xdp_lock);

	return err;
}

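/* Resolve a non-XDP_PASS verdict. Buffer ownership follows the verdict:
 * XDP_DROP, XDP_ABORTED and unknown actions free the buffer immediately,
 * while XDP_TX and XDP_REDIRECT reuse it on success and free it on failure
 * after bumping the corresponding error counter.
 */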
static void gve_xdp_done_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
			     struct xdp_buff *xdp, struct bpf_prog *xprog,
			     int xdp_act,
			     struct gve_rx_buf_state_dqo *buf_state)
{
	int err;

	switch (xdp_act) {
	case XDP_ABORTED:
	case XDP_DROP:
	default:
		gve_free_buffer(rx, buf_state);
		break;
	case XDP_TX:
		err = gve_xdp_tx_dqo(priv, rx, xdp);
		if (unlikely(err))
			goto err;
		gve_reuse_buffer(rx, buf_state);
		break;
	case XDP_REDIRECT:
		err = xdp_do_redirect(priv->dev, xdp, xprog);
		if (unlikely(err))
			goto err;
		gve_reuse_buffer(rx, buf_state);
		break;
	}
	u64_stats_update_begin(&rx->statss);
	if ((u32)xdp_act < GVE_XDP_ACTIONS)
		rx->xdp_actions[xdp_act]++;
	u64_stats_update_end(&rx->statss);
	return;
err:
	u64_stats_update_begin(&rx->statss);
	if (xdp_act == XDP_TX)
		rx->xdp_tx_errors++;
	else if (xdp_act == XDP_REDIRECT)
		rx->xdp_redirect_errors++;
	u64_stats_update_end(&rx->statss);
	gve_free_buffer(rx, buf_state);
}

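/* Per-completion receive processing: the descriptor's buffer ID is validated
 * against the buffer-state array, a split header (if any) is copied out of
 * the ring's header buffers, an attached XDP program runs while no skb is in
 * progress for the packet, short end-of-packet buffers up to rx_copybreak
 * are copied into a fresh skb, and everything else is attached to the skb as
 * page frags.
 */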
/* Returns 0 if descriptor is completed successfully.
 * Returns -EINVAL if descriptor is invalid.
 * Returns -ENOMEM if data cannot be copied to skb.
 */
static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx,
		      const struct gve_rx_compl_desc_dqo *compl_desc,
		      u32 desc_idx, int queue_idx)
{
	const u16 buffer_id = le16_to_cpu(compl_desc->buf_id);
	const bool hbo = compl_desc->header_buffer_overflow;
	const bool eop = compl_desc->end_of_packet != 0;
	const bool hsplit = compl_desc->split_header;
	struct gve_rx_buf_state_dqo *buf_state;
	struct gve_priv *priv = rx->gve;
	struct bpf_prog *xprog;
	u16 buf_len;
	u16 hdr_len;

	if (unlikely(buffer_id >= rx->dqo.num_buf_states)) {
		net_err_ratelimited("%s: Invalid RX buffer_id=%u\n",
				    priv->dev->name, buffer_id);
		return -EINVAL;
	}
	buf_state = &rx->dqo.buf_states[buffer_id];
	if (unlikely(!gve_buf_state_is_allocated(rx, buf_state))) {
		net_err_ratelimited("%s: RX buffer_id is not allocated: %u\n",
				    priv->dev->name, buffer_id);
		return -EINVAL;
	}

	if (unlikely(compl_desc->rx_error)) {
		gve_free_buffer(rx, buf_state);
		return -EINVAL;
	}

	buf_len = compl_desc->packet_len;
	hdr_len = compl_desc->header_len;

	/* The page might not have been used for a while and was likely last
	 * written by a different thread.
	 */
	if (rx->dqo.page_pool) {
		if (!netmem_is_net_iov(buf_state->page_info.netmem))
			prefetch(netmem_to_page(buf_state->page_info.netmem));
	} else {
		prefetch(buf_state->page_info.page);
	}

	/* Copy the header into the skb in the case of header split */
	if (hsplit) {
		int unsplit = 0;

		if (hdr_len && !hbo) {
			rx->ctx.skb_head = gve_rx_copy_data(priv->dev, napi,
							    rx->dqo.hdr_bufs.data +
							    desc_idx * priv->header_buf_size,
							    hdr_len);
			if (unlikely(!rx->ctx.skb_head))
				goto error;
			rx->ctx.skb_tail = rx->ctx.skb_head;

			if (rx->dqo.page_pool)
				skb_mark_for_recycle(rx->ctx.skb_head);
		} else {
			unsplit = 1;
		}
		u64_stats_update_begin(&rx->statss);
		rx->rx_hsplit_pkt++;
		rx->rx_hsplit_unsplit_pkt += unsplit;
		rx->rx_hsplit_bytes += hdr_len;
		u64_stats_update_end(&rx->statss);
	}

	/* Sync the portion of dma buffer for CPU to read. */
	dma_sync_single_range_for_cpu(&priv->pdev->dev, buf_state->addr,
				      buf_state->page_info.page_offset +
				      buf_state->page_info.pad,
				      buf_len, DMA_FROM_DEVICE);

	/* Append to current skb if one exists. */
	if (rx->ctx.skb_head) {
		if (unlikely(gve_rx_append_frags(napi, buf_state, buf_len, rx,
						 priv)) != 0) {
			goto error;
		}
		return 0;
	}

	xprog = READ_ONCE(priv->xdp_prog);
	if (xprog) {
		struct xdp_buff xdp;
		void *old_data;
		int xdp_act;

		xdp_init_buff(&xdp, buf_state->page_info.buf_size,
			      &rx->xdp_rxq);
		xdp_prepare_buff(&xdp,
				 buf_state->page_info.page_address +
				 buf_state->page_info.page_offset,
				 buf_state->page_info.pad,
				 buf_len, false);
		old_data = xdp.data;
		xdp_act = bpf_prog_run_xdp(xprog, &xdp);
		buf_state->page_info.pad += xdp.data - old_data;
		buf_len = xdp.data_end - xdp.data;
		if (xdp_act != XDP_PASS) {
			gve_xdp_done_dqo(priv, rx, &xdp, xprog, xdp_act,
					 buf_state);
			return 0;
		}

		u64_stats_update_begin(&rx->statss);
		rx->xdp_actions[XDP_PASS]++;
		u64_stats_update_end(&rx->statss);
	}

	if (eop && buf_len <= priv->rx_copybreak) {
		rx->ctx.skb_head = gve_rx_copy(priv->dev, napi,
					       &buf_state->page_info, buf_len);
		if (unlikely(!rx->ctx.skb_head))
			goto error;
		rx->ctx.skb_tail = rx->ctx.skb_head;

		u64_stats_update_begin(&rx->statss);
		rx->rx_copied_pkt++;
		rx->rx_copybreak_pkt++;
		u64_stats_update_end(&rx->statss);

		gve_free_buffer(rx, buf_state);
		return 0;
	}

	rx->ctx.skb_head = napi_get_frags(napi);
	if (unlikely(!rx->ctx.skb_head))
		goto error;
	rx->ctx.skb_tail = rx->ctx.skb_head;

	if (gve_rx_should_trigger_copy_ondemand(rx)) {
		if (gve_rx_copy_ondemand(rx, buf_state, buf_len) < 0)
			goto error;
		return 0;
	}

	if (rx->dqo.page_pool)
		skb_mark_for_recycle(rx->ctx.skb_head);

	gve_skb_add_rx_frag(rx, buf_state, 0, buf_len);
	gve_reuse_buffer(rx, buf_state);
	return 0;

error:
	gve_free_buffer(rx, buf_state);
	return -ENOMEM;
}

static int gve_rx_complete_rsc(struct sk_buff *skb,
			       const struct gve_rx_compl_desc_dqo *desc,
			       struct gve_ptype ptype)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);

	/* Only TCP is supported right now. */
	if (ptype.l4_type != GVE_L4_TYPE_TCP)
		return -EINVAL;

	switch (ptype.l3_type) {
	case GVE_L3_TYPE_IPV4:
		shinfo->gso_type = SKB_GSO_TCPV4;
		break;
	case GVE_L3_TYPE_IPV6:
		shinfo->gso_type = SKB_GSO_TCPV6;
		break;
	default:
		return -EINVAL;
	}

	shinfo->gso_size = le16_to_cpu(desc->rsc_seg_len);
	return 0;
}

/* Returns 0 if skb is completed successfully, -1 otherwise. */
static int gve_rx_complete_skb(struct gve_rx_ring *rx, struct napi_struct *napi,
			       const struct gve_rx_compl_desc_dqo *desc,
			       netdev_features_t feat)
{
	struct gve_ptype ptype =
		rx->gve->ptype_lut_dqo->ptypes[desc->packet_type];
	int err;

	skb_record_rx_queue(rx->ctx.skb_head, rx->q_num);

	if (feat & NETIF_F_RXHASH)
		gve_rx_skb_hash(rx->ctx.skb_head, desc, ptype);

	if (feat & NETIF_F_RXCSUM)
		gve_rx_skb_csum(rx->ctx.skb_head, desc, ptype);

	if (rx->gve->ts_config.rx_filter == HWTSTAMP_FILTER_ALL)
		gve_rx_skb_hwtstamp(rx, le32_to_cpu(desc->ts));

	/* RSC packets must set gso_size otherwise the TCP stack will complain
	 * that packets are larger than MTU.
	 */
	if (desc->rsc) {
		err = gve_rx_complete_rsc(rx->ctx.skb_head, desc, ptype);
		if (err < 0)
			return err;
	}

	if (skb_headlen(rx->ctx.skb_head) == 0)
		napi_gro_frags(napi);
	else
		napi_gro_receive(napi, rx->ctx.skb_head);

	return 0;
}

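/* NAPI poll handler for a DQO RX ring. Completion ownership is tracked with
 * a generation bit: the ring starts zeroed, so a descriptor whose generation
 * field still equals cur_gen_bit has not been written by hardware yet, and
 * cur_gen_bit flips every time head wraps back to slot 0. The dma_rmb()
 * below orders that ownership check before any other descriptor field is
 * read.
 */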
int gve_rx_poll_dqo(struct gve_notify_block *block, int budget)
{
	struct gve_rx_compl_queue_dqo *complq;
	struct napi_struct *napi;
	netdev_features_t feat;
	struct gve_rx_ring *rx;
	struct gve_priv *priv;
	u64 xdp_redirects;
	u32 work_done = 0;
	u64 bytes = 0;
	u64 xdp_txs;
	int err;

	napi = &block->napi;
	feat = napi->dev->features;

	rx = block->rx;
	priv = rx->gve;
	complq = &rx->dqo.complq;

	xdp_redirects = rx->xdp_actions[XDP_REDIRECT];
	xdp_txs = rx->xdp_actions[XDP_TX];

	while (work_done < budget) {
		struct gve_rx_compl_desc_dqo *compl_desc =
			&complq->desc_ring[complq->head];
		u32 pkt_bytes;

		/* No more new packets */
		if (compl_desc->generation == complq->cur_gen_bit)
			break;

		/* Prefetch the next two descriptors. */
		prefetch(&complq->desc_ring[(complq->head + 1) & complq->mask]);
		prefetch(&complq->desc_ring[(complq->head + 2) & complq->mask]);

		/* Do not read data until we own the descriptor */
		dma_rmb();

		err = gve_rx_dqo(napi, rx, compl_desc, complq->head, rx->q_num);
		if (err < 0) {
			gve_rx_free_skb(napi, rx);
			u64_stats_update_begin(&rx->statss);
			if (err == -ENOMEM)
				rx->rx_skb_alloc_fail++;
			else if (err == -EINVAL)
				rx->rx_desc_err_dropped_pkt++;
			u64_stats_update_end(&rx->statss);
		}

		complq->head = (complq->head + 1) & complq->mask;
		complq->num_free_slots++;

		/* When the ring wraps, the generation bit is flipped. */
		complq->cur_gen_bit ^= (complq->head == 0);

		/* Receiving a completion means we have space to post another
		 * buffer on the buffer queue.
		 */
		{
			struct gve_rx_buf_queue_dqo *bufq = &rx->dqo.bufq;

			bufq->head = (bufq->head + 1) & bufq->mask;
		}

		/* Free running counter of completed descriptors */
		rx->cnt++;

		if (!rx->ctx.skb_head)
			continue;

		if (!compl_desc->end_of_packet)
			continue;

		work_done++;
		pkt_bytes = rx->ctx.skb_head->len;
		/* The ethernet header (first ETH_HLEN bytes) is snipped off
		 * by eth_type_trans.
		 */
		if (skb_headlen(rx->ctx.skb_head))
			pkt_bytes += ETH_HLEN;

		/* gve_rx_complete_skb() will consume skb if successful */
		if (gve_rx_complete_skb(rx, napi, compl_desc, feat) != 0) {
			gve_rx_free_skb(napi, rx);
			u64_stats_update_begin(&rx->statss);
			rx->rx_desc_err_dropped_pkt++;
			u64_stats_update_end(&rx->statss);
			continue;
		}

		bytes += pkt_bytes;
		rx->ctx.skb_head = NULL;
		rx->ctx.skb_tail = NULL;
	}

	if (xdp_txs != rx->xdp_actions[XDP_TX])
		gve_xdp_tx_flush_dqo(priv, rx->q_num);

	if (xdp_redirects != rx->xdp_actions[XDP_REDIRECT])
		xdp_do_flush();

	gve_rx_post_buffers_dqo(rx);

	u64_stats_update_begin(&rx->statss);
	rx->rpackets += work_done;
	rx->rbytes += bytes;
	u64_stats_update_end(&rx->statss);

	return work_done;
}