// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2021 Google, Inc.
 */

#include "gve.h"
#include "gve_dqo.h"
#include "gve_adminq.h"
#include "gve_utils.h"
#include <linux/bpf.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <net/ip6_checksum.h>
#include <net/ipv6.h>
#include <net/tcp.h>
#include <net/xdp_sock_drv.h>

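/* Free the coherent DMA region backing this ring's header buffers used for
 * header-split. One allocation of header_buf_size * buffer-queue-slots is
 * released, and the pointer is cleared so a repeated free is a no-op.
 */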
static void gve_rx_free_hdr_bufs(struct gve_priv *priv, struct gve_rx_ring *rx)
{
	struct device *hdev = &priv->pdev->dev;
	int buf_count = rx->dqo.bufq.mask + 1;

	if (rx->dqo.hdr_bufs.data) {
		dma_free_coherent(hdev, priv->header_buf_size * buf_count,
				  rx->dqo.hdr_bufs.data, rx->dqo.hdr_bufs.addr);
		rx->dqo.hdr_bufs.data = NULL;
	}
}

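/* Reset the software state of a DQO RX ring: buffer-queue and completion-queue
 * indices, the in-progress SKB context, and the singly linked free list that
 * threads the buffer states together (terminated with -1).
 */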
static void gve_rx_init_ring_state_dqo(struct gve_rx_ring *rx,
				       const u32 buffer_queue_slots,
				       const u32 completion_queue_slots)
{
	int i;

	/* Set buffer queue state */
	rx->dqo.bufq.mask = buffer_queue_slots - 1;
	rx->dqo.bufq.head = 0;
	rx->dqo.bufq.tail = 0;

	/* Set completion queue state */
	rx->dqo.complq.num_free_slots = completion_queue_slots;
	rx->dqo.complq.mask = completion_queue_slots - 1;
	rx->dqo.complq.cur_gen_bit = 0;
	rx->dqo.complq.head = 0;

	/* Set RX SKB context */
	rx->ctx.skb_head = NULL;
	rx->ctx.skb_tail = NULL;

	/* Set up linked list of buffer IDs */
	if (rx->dqo.buf_states) {
		for (i = 0; i < rx->dqo.num_buf_states - 1; i++)
			rx->dqo.buf_states[i].next = i + 1;
		rx->dqo.buf_states[rx->dqo.num_buf_states - 1].next = -1;
	}

	rx->dqo.free_buf_states = 0;
	rx->dqo.recycled_buf_states.head = -1;
	rx->dqo.recycled_buf_states.tail = -1;
	rx->dqo.used_buf_states.head = -1;
	rx->dqo.used_buf_states.tail = -1;
}

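/* Return a ring to its post-allocation state without freeing its descriptor
 * memory: zero the buffer and completion descriptor rings and q_resources,
 * release every buffer back to the page pool or QPL, and rebuild the ring's
 * software state.
 */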
static void gve_rx_reset_ring_dqo(struct gve_priv *priv, int idx)
{
	struct gve_rx_ring *rx = &priv->rx[idx];
	size_t size;
	int i;

	const u32 buffer_queue_slots = priv->rx_desc_cnt;
	const u32 completion_queue_slots = priv->rx_desc_cnt;

	/* Reset buffer queue */
	if (rx->dqo.bufq.desc_ring) {
		size = sizeof(rx->dqo.bufq.desc_ring[0]) *
			buffer_queue_slots;
		memset(rx->dqo.bufq.desc_ring, 0, size);
	}

	/* Reset completion queue */
	if (rx->dqo.complq.desc_ring) {
		size = sizeof(rx->dqo.complq.desc_ring[0]) *
			completion_queue_slots;
		memset(rx->dqo.complq.desc_ring, 0, size);
	}

	/* Reset q_resources */
	if (rx->q_resources)
		memset(rx->q_resources, 0, sizeof(*rx->q_resources));

	/* Reset buf states */
	if (rx->dqo.buf_states) {
		for (i = 0; i < rx->dqo.num_buf_states; i++) {
			struct gve_rx_buf_state_dqo *bs = &rx->dqo.buf_states[i];

			if (rx->dqo.page_pool)
				gve_free_to_page_pool(rx, bs, false);
			else
				gve_free_qpl_page_dqo(bs);
		}
	}

	gve_rx_init_ring_state_dqo(rx, buffer_queue_slots,
				   completion_queue_slots);
}

void gve_rx_stop_ring_dqo(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);
	struct gve_rx_ring *rx = &priv->rx[idx];

	if (!gve_rx_was_added_to_block(priv, idx))
		return;

	if (rx->dqo.page_pool)
		page_pool_disable_direct_recycling(rx->dqo.page_pool);
	gve_remove_napi(priv, ntfy_idx);
	gve_rx_remove_from_block(priv, idx);
	gve_rx_reset_ring_dqo(priv, idx);
}

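/* Tear a ring down completely: release q_resources, every buffer state
 * (including any XSK buffers still attached), the QPL or page pool, both
 * descriptor rings, the buf_states array and the header buffers. This is the
 * counterpart of gve_rx_alloc_ring_dqo().
 */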
void gve_rx_free_ring_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
			  struct gve_rx_alloc_rings_cfg *cfg)
{
	struct device *hdev = &priv->pdev->dev;
	size_t completion_queue_slots;
	size_t buffer_queue_slots;
	int idx = rx->q_num;
	size_t size;
	u32 qpl_id;
	int i;

	completion_queue_slots = rx->dqo.complq.mask + 1;
	buffer_queue_slots = rx->dqo.bufq.mask + 1;

	if (rx->q_resources) {
		dma_free_coherent(hdev, sizeof(*rx->q_resources),
				  rx->q_resources, rx->q_resources_bus);
		rx->q_resources = NULL;
	}

	for (i = 0; i < rx->dqo.num_buf_states; i++) {
		struct gve_rx_buf_state_dqo *bs = &rx->dqo.buf_states[i];

		if (rx->dqo.page_pool)
			gve_free_to_page_pool(rx, bs, false);
		else
			gve_free_qpl_page_dqo(bs);
		if (gve_buf_state_is_allocated(rx, bs) && bs->xsk_buff) {
			xsk_buff_free(bs->xsk_buff);
			bs->xsk_buff = NULL;
		}
	}

	if (rx->dqo.qpl) {
		qpl_id = gve_get_rx_qpl_id(cfg->qcfg_tx, rx->q_num);
		gve_free_queue_page_list(priv, rx->dqo.qpl, qpl_id);
		rx->dqo.qpl = NULL;
	}

	if (rx->dqo.bufq.desc_ring) {
		size = sizeof(rx->dqo.bufq.desc_ring[0]) * buffer_queue_slots;
		dma_free_coherent(hdev, size, rx->dqo.bufq.desc_ring,
				  rx->dqo.bufq.bus);
		rx->dqo.bufq.desc_ring = NULL;
	}

	if (rx->dqo.complq.desc_ring) {
		size = sizeof(rx->dqo.complq.desc_ring[0]) *
			completion_queue_slots;
		dma_free_coherent(hdev, size, rx->dqo.complq.desc_ring,
				  rx->dqo.complq.bus);
		rx->dqo.complq.desc_ring = NULL;
	}

	kvfree(rx->dqo.buf_states);
	rx->dqo.buf_states = NULL;

	if (rx->dqo.page_pool) {
		page_pool_destroy(rx->dqo.page_pool);
		rx->dqo.page_pool = NULL;
	}

	gve_rx_free_hdr_bufs(priv, rx);

	netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx);
}

static int gve_rx_alloc_hdr_bufs(struct gve_priv *priv, struct gve_rx_ring *rx,
				 const u32 buf_count)
{
	struct device *hdev = &priv->pdev->dev;

	rx->dqo.hdr_bufs.data = dma_alloc_coherent(hdev, priv->header_buf_size * buf_count,
						   &rx->dqo.hdr_bufs.addr, GFP_KERNEL);
	if (!rx->dqo.hdr_bufs.data)
		return -ENOMEM;

	return 0;
}

void gve_rx_start_ring_dqo(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);

	gve_rx_add_to_block(priv, idx);
	gve_add_napi(priv, ntfy_idx, gve_napi_poll_dqo);
}

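/* Allocate one DQO RX ring: the buffer-state array, optional header buffers
 * for header-split, the completion and buffer descriptor rings, and either a
 * page pool (raw addressing) or a queue page list (QPL). On any failure the
 * partially constructed ring is unwound via gve_rx_free_ring_dqo().
 */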
int gve_rx_alloc_ring_dqo(struct gve_priv *priv,
			  struct gve_rx_alloc_rings_cfg *cfg,
			  struct gve_rx_ring *rx,
			  int idx)
{
	struct device *hdev = &priv->pdev->dev;
	struct page_pool *pool;
	int qpl_page_cnt;
	size_t size;
	u32 qpl_id;

	const u32 buffer_queue_slots = cfg->ring_size;
	const u32 completion_queue_slots = cfg->ring_size;

	netif_dbg(priv, drv, priv->dev, "allocating rx ring DQO\n");

	memset(rx, 0, sizeof(*rx));
	rx->gve = priv;
	rx->q_num = idx;
	rx->packet_buffer_size = cfg->packet_buffer_size;

	if (cfg->xdp) {
		rx->packet_buffer_truesize = GVE_XDP_RX_BUFFER_SIZE_DQO;
		rx->rx_headroom = XDP_PACKET_HEADROOM;
	} else {
		rx->packet_buffer_truesize = rx->packet_buffer_size;
		rx->rx_headroom = 0;
	}

	rx->dqo.num_buf_states = cfg->raw_addressing ? buffer_queue_slots :
		gve_get_rx_pages_per_qpl_dqo(cfg->ring_size);
	rx->dqo.buf_states = kvcalloc_node(rx->dqo.num_buf_states,
					   sizeof(rx->dqo.buf_states[0]),
					   GFP_KERNEL, priv->numa_node);
	if (!rx->dqo.buf_states)
		return -ENOMEM;

	/* Allocate header buffers for header-split */
	if (cfg->enable_header_split)
		if (gve_rx_alloc_hdr_bufs(priv, rx, buffer_queue_slots))
			goto err;

	/* Allocate RX completion queue */
	size = sizeof(rx->dqo.complq.desc_ring[0]) *
		completion_queue_slots;
	rx->dqo.complq.desc_ring =
		dma_alloc_coherent(hdev, size, &rx->dqo.complq.bus, GFP_KERNEL);
	if (!rx->dqo.complq.desc_ring)
		goto err;

	/* Allocate RX buffer queue */
	size = sizeof(rx->dqo.bufq.desc_ring[0]) * buffer_queue_slots;
	rx->dqo.bufq.desc_ring =
		dma_alloc_coherent(hdev, size, &rx->dqo.bufq.bus, GFP_KERNEL);
	if (!rx->dqo.bufq.desc_ring)
		goto err;

	if (cfg->raw_addressing) {
		pool = gve_rx_create_page_pool(priv, rx, cfg->xdp);
		if (IS_ERR(pool))
			goto err;

		rx->dqo.page_pool = pool;
	} else {
		qpl_id = gve_get_rx_qpl_id(cfg->qcfg_tx, rx->q_num);
		qpl_page_cnt = gve_get_rx_pages_per_qpl_dqo(cfg->ring_size);

		rx->dqo.qpl = gve_alloc_queue_page_list(priv, qpl_id,
							qpl_page_cnt);
		if (!rx->dqo.qpl)
			goto err;
		rx->dqo.next_qpl_page_idx = 0;
	}

	rx->q_resources = dma_alloc_coherent(hdev, sizeof(*rx->q_resources),
					     &rx->q_resources_bus, GFP_KERNEL);
	if (!rx->q_resources)
		goto err;

	gve_rx_init_ring_state_dqo(rx, buffer_queue_slots,
				   completion_queue_slots);

	return 0;

err:
	gve_rx_free_ring_dqo(priv, rx, cfg);
	return -ENOMEM;
}

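/* Write the buffer queue tail to the queue's doorbell in BAR2; the doorbell
 * offset is read from rx->q_resources->db_index.
 */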
void gve_rx_write_doorbell_dqo(const struct gve_priv *priv, int queue_idx)
{
	const struct gve_rx_ring *rx = &priv->rx[queue_idx];
	u64 index = be32_to_cpu(rx->q_resources->db_index);

	iowrite32(rx->dqo.bufq.tail, &priv->db_bar2[index]);
}

int gve_rx_alloc_rings_dqo(struct gve_priv *priv,
			   struct gve_rx_alloc_rings_cfg *cfg)
{
	struct gve_rx_ring *rx;
	int err;
	int i;

	rx = kvcalloc(cfg->qcfg_rx->max_queues, sizeof(struct gve_rx_ring),
		      GFP_KERNEL);
	if (!rx)
		return -ENOMEM;

	for (i = 0; i < cfg->qcfg_rx->num_queues; i++) {
		err = gve_rx_alloc_ring_dqo(priv, cfg, &rx[i], i);
		if (err) {
			netif_err(priv, drv, priv->dev,
				  "Failed to alloc rx ring=%d: err=%d\n",
				  i, err);
			goto err;
		}
	}

	cfg->rx = rx;
	return 0;

err:
	for (i--; i >= 0; i--)
		gve_rx_free_ring_dqo(priv, &rx[i], cfg);
	kvfree(rx);
	return err;
}

void gve_rx_free_rings_dqo(struct gve_priv *priv,
			   struct gve_rx_alloc_rings_cfg *cfg)
{
	struct gve_rx_ring *rx = cfg->rx;
	int i;

	if (!rx)
		return;

	for (i = 0; i < cfg->qcfg_rx->num_queues; i++)
		gve_rx_free_ring_dqo(priv, &rx[i], cfg);

	kvfree(rx);
	cfg->rx = NULL;
}

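/* Post as many RX buffers as both queues allow: the free slots on the buffer
 * queue, capped by the free slots on the completion queue. Each posted
 * descriptor optionally carries a header-buffer address, and the doorbell is
 * rung every GVE_RX_BUF_THRESH_DQO descriptors.
 */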
void gve_rx_post_buffers_dqo(struct gve_rx_ring *rx)
{
	struct gve_rx_compl_queue_dqo *complq = &rx->dqo.complq;
	struct gve_rx_buf_queue_dqo *bufq = &rx->dqo.bufq;
	struct gve_priv *priv = rx->gve;
	u32 num_avail_slots;
	u32 num_full_slots;
	u32 num_posted = 0;

	num_full_slots = (bufq->tail - bufq->head) & bufq->mask;
	num_avail_slots = bufq->mask - num_full_slots;

	num_avail_slots = min_t(u32, num_avail_slots, complq->num_free_slots);
	while (num_posted < num_avail_slots) {
		struct gve_rx_desc_dqo *desc = &bufq->desc_ring[bufq->tail];

		if (unlikely(gve_alloc_buffer(rx, desc))) {
			u64_stats_update_begin(&rx->statss);
			rx->rx_buf_alloc_fail++;
			u64_stats_update_end(&rx->statss);
			break;
		}

		if (rx->dqo.hdr_bufs.data)
			desc->header_buf_addr =
				cpu_to_le64(rx->dqo.hdr_bufs.addr +
					    priv->header_buf_size * bufq->tail);

		bufq->tail = (bufq->tail + 1) & bufq->mask;
		complq->num_free_slots--;
		num_posted++;

		if ((bufq->tail & (GVE_RX_BUF_THRESH_DQO - 1)) == 0)
			gve_rx_write_doorbell_dqo(priv, rx->q_num);
	}

	rx->fill_cnt += num_posted;
}

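/* Set skb->ip_summed from the completion descriptor: leave CHECKSUM_NONE
 * unless the device parsed L3/L4, reported no checksum errors, and the L4
 * protocol is one we recognize, in which case mark CHECKSUM_UNNECESSARY.
 */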
static void gve_rx_skb_csum(struct sk_buff *skb,
			    const struct gve_rx_compl_desc_dqo *desc,
			    struct gve_ptype ptype)
{
	skb->ip_summed = CHECKSUM_NONE;

	/* HW did not identify and process L3 and L4 headers. */
	if (unlikely(!desc->l3_l4_processed))
		return;

	if (ptype.l3_type == GVE_L3_TYPE_IPV4) {
		if (unlikely(desc->csum_ip_err || desc->csum_external_ip_err))
			return;
	} else if (ptype.l3_type == GVE_L3_TYPE_IPV6) {
		/* Checksum should be skipped if this flag is set. */
		if (unlikely(desc->ipv6_ex_add))
			return;
	}

	if (unlikely(desc->csum_l4_err))
		return;

	switch (ptype.l4_type) {
	case GVE_L4_TYPE_TCP:
	case GVE_L4_TYPE_UDP:
	case GVE_L4_TYPE_ICMP:
	case GVE_L4_TYPE_SCTP:
		skb->ip_summed = CHECKSUM_UNNECESSARY;
		break;
	default:
		break;
	}
}

static void gve_rx_skb_hash(struct sk_buff *skb,
			    const struct gve_rx_compl_desc_dqo *compl_desc,
			    struct gve_ptype ptype)
{
	enum pkt_hash_types hash_type = PKT_HASH_TYPE_L2;

	if (ptype.l4_type != GVE_L4_TYPE_UNKNOWN)
		hash_type = PKT_HASH_TYPE_L4;
	else if (ptype.l3_type != GVE_L3_TYPE_UNKNOWN)
		hash_type = PKT_HASH_TYPE_L3;

	skb_set_hash(skb, le32_to_cpu(compl_desc->hash), hash_type);
}

/* Expand the hardware timestamp to its full 64-bit width and add it to the
 * skb.
 *
 * The algorithm uses the hardware timestamp from the descriptor to compute a
 * diff relative to the last read of the NIC clock. This diff can be positive
 * or negative, since the clock may have been read more recently than the
 * hardware received this packet. A set high bit in the diff indicates the
 * clock read is the more recent of the two, in which case the diff is applied
 * as a negative offset.
 *
 * Note that this means the result is invalid if the time delta between packet
 * reception and the last clock read is greater than ~2 seconds.
 */
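/* Worked example (values chosen purely for illustration): if the last clock
 * read was 0x0000000a_fffffff0 ns (low 32 bits 0xfffffff0) and the descriptor
 * carries hwts = 0x00000010, then diff = 0x00000010 - 0xfffffff0 = +32 as an
 * s32, so the packet is stamped 32 ns after the counter wrapped. Conversely,
 * a diff with its high bit set is negative: the packet predates the clock
 * read. The s32 diff spans +/- 2^31 ns, which is the ~2 second window noted
 * above.
 */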
static void gve_rx_skb_hwtstamp(struct gve_rx_ring *rx, u32 hwts)
{
	u64 last_read = READ_ONCE(rx->gve->last_sync_nic_counter);
	struct sk_buff *skb = rx->ctx.skb_head;
	u32 low = (u32)last_read;
	s32 diff = hwts - low;

	skb_hwtstamps(skb)->hwtstamp = ns_to_ktime(last_read + diff);
}

static void gve_rx_free_skb(struct napi_struct *napi, struct gve_rx_ring *rx)
{
	if (!rx->ctx.skb_head)
		return;

	if (rx->ctx.skb_head == napi->skb)
		napi->skb = NULL;
	dev_kfree_skb_any(rx->ctx.skb_head);
	rx->ctx.skb_head = NULL;
	rx->ctx.skb_tail = NULL;
}

static bool gve_rx_should_trigger_copy_ondemand(struct gve_rx_ring *rx)
{
	if (!rx->dqo.qpl)
		return false;
	if (rx->dqo.used_buf_states_cnt <
		     (rx->dqo.num_buf_states -
		     GVE_DQO_QPL_ONDEMAND_ALLOC_THRESHOLD))
		return false;
	return true;
}

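/* Used when QPL buffer states are running low: copy the fragment into a
 * freshly allocated page, attach that page to the tail skb, and recycle the
 * original buffer immediately instead of leaving it referenced by the skb.
 */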
static int gve_rx_copy_ondemand(struct gve_rx_ring *rx,
				struct gve_rx_buf_state_dqo *buf_state,
				u16 buf_len)
{
	struct page *page = alloc_pages_node(rx->gve->numa_node, GFP_ATOMIC, 0);
	int num_frags;

	if (!page)
		return -ENOMEM;

	memcpy(page_address(page),
	       buf_state->page_info.page_address +
	       buf_state->page_info.page_offset,
	       buf_len);
	num_frags = skb_shinfo(rx->ctx.skb_tail)->nr_frags;
	skb_add_rx_frag(rx->ctx.skb_tail, num_frags, page,
			0, buf_len, PAGE_SIZE);

	u64_stats_update_begin(&rx->statss);
	rx->rx_frag_alloc_cnt++;
	u64_stats_update_end(&rx->statss);
	/* Return unused buffer. */
	gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state);
	return 0;
}

static void gve_skb_add_rx_frag(struct gve_rx_ring *rx,
				struct gve_rx_buf_state_dqo *buf_state,
				int num_frags, u16 buf_len)
{
	if (rx->dqo.page_pool) {
		skb_add_rx_frag_netmem(rx->ctx.skb_tail, num_frags,
				       buf_state->page_info.netmem,
				       buf_state->page_info.page_offset +
				       buf_state->page_info.pad, buf_len,
				       buf_state->page_info.buf_size);
	} else {
		skb_add_rx_frag(rx->ctx.skb_tail, num_frags,
				buf_state->page_info.page,
				buf_state->page_info.page_offset +
				buf_state->page_info.pad, buf_len,
				buf_state->page_info.buf_size);
	}
}

/* Chains multiple skbs for a single rx packet.
 * Returns 0 if the buffer is appended, -1 otherwise.
 */
static int gve_rx_append_frags(struct napi_struct *napi,
			       struct gve_rx_buf_state_dqo *buf_state,
			       u16 buf_len, struct gve_rx_ring *rx,
			       struct gve_priv *priv)
{
	int num_frags = skb_shinfo(rx->ctx.skb_tail)->nr_frags;

	if (unlikely(num_frags == MAX_SKB_FRAGS)) {
		struct sk_buff *skb;

		skb = napi_alloc_skb(napi, 0);
		if (!skb)
			return -1;

		if (rx->dqo.page_pool)
			skb_mark_for_recycle(skb);

		if (rx->ctx.skb_tail == rx->ctx.skb_head)
			skb_shinfo(rx->ctx.skb_head)->frag_list = skb;
		else
			rx->ctx.skb_tail->next = skb;
		rx->ctx.skb_tail = skb;
		num_frags = 0;
	}
	if (rx->ctx.skb_tail != rx->ctx.skb_head) {
		rx->ctx.skb_head->len += buf_len;
		rx->ctx.skb_head->data_len += buf_len;
		rx->ctx.skb_head->truesize += buf_state->page_info.buf_size;
	}

	/* Trigger ondemand page allocation if we are running low on buffers */
	if (gve_rx_should_trigger_copy_ondemand(rx))
		return gve_rx_copy_ondemand(rx, buf_state, buf_len);

	gve_skb_add_rx_frag(rx, buf_state, num_frags, buf_len);
	gve_reuse_buffer(rx, buf_state);
	return 0;
}

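/* Convert an XDP buffer to a frame and transmit it on this queue's dedicated
 * XDP TX ring, serialized by that ring's xdp_lock. If the conversion fails,
 * an XSK buffer is freed before returning.
 */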
static int gve_xdp_tx_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
			  struct xdp_buff *xdp)
{
	struct gve_tx_ring *tx;
	struct xdp_frame *xdpf;
	u32 tx_qid;
	int err;

	xdpf = xdp_convert_buff_to_frame(xdp);
	if (unlikely(!xdpf)) {
		if (rx->xsk_pool)
			xsk_buff_free(xdp);
		return -ENOSPC;
	}

	tx_qid = gve_xdp_tx_queue_id(priv, rx->q_num);
	tx = &priv->tx[tx_qid];
	spin_lock(&tx->dqo_tx.xdp_lock);
	err = gve_xdp_xmit_one_dqo(priv, tx, xdpf);
	spin_unlock(&tx->dqo_tx.xdp_lock);

	return err;
}

static void gve_xsk_done_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
			     struct xdp_buff *xdp, struct bpf_prog *xprog,
			     int xdp_act)
{
	switch (xdp_act) {
	case XDP_ABORTED:
	case XDP_DROP:
	default:
		xsk_buff_free(xdp);
		break;
	case XDP_TX:
		if (unlikely(gve_xdp_tx_dqo(priv, rx, xdp)))
			goto err;
		break;
	case XDP_REDIRECT:
		if (unlikely(xdp_do_redirect(priv->dev, xdp, xprog)))
			goto err;
		break;
	}

	u64_stats_update_begin(&rx->statss);
	if ((u32)xdp_act < GVE_XDP_ACTIONS)
		rx->xdp_actions[xdp_act]++;
	u64_stats_update_end(&rx->statss);
	return;

err:
	u64_stats_update_begin(&rx->statss);
	if (xdp_act == XDP_TX)
		rx->xdp_tx_errors++;
	if (xdp_act == XDP_REDIRECT)
		rx->xdp_redirect_errors++;
	u64_stats_update_end(&rx->statss);
}

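/* Complete a non-XSK XDP verdict: free the buffer on drop/abort, transmit or
 * redirect on XDP_TX/XDP_REDIRECT (reusing the buffer on success, freeing it
 * on failure), and update the per-action statistics.
 */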
static void gve_xdp_done_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
			     struct xdp_buff *xdp, struct bpf_prog *xprog,
			     int xdp_act,
			     struct gve_rx_buf_state_dqo *buf_state)
{
	int err;

	switch (xdp_act) {
	case XDP_ABORTED:
	case XDP_DROP:
	default:
		gve_free_buffer(rx, buf_state);
		break;
	case XDP_TX:
		err = gve_xdp_tx_dqo(priv, rx, xdp);
		if (unlikely(err))
			goto err;
		gve_reuse_buffer(rx, buf_state);
		break;
	case XDP_REDIRECT:
		err = xdp_do_redirect(priv->dev, xdp, xprog);
		if (unlikely(err))
			goto err;
		gve_reuse_buffer(rx, buf_state);
		break;
	}

	u64_stats_update_begin(&rx->statss);
	if ((u32)xdp_act < GVE_XDP_ACTIONS)
		rx->xdp_actions[xdp_act]++;
	u64_stats_update_end(&rx->statss);
	return;

err:
	u64_stats_update_begin(&rx->statss);
	if (xdp_act == XDP_TX)
		rx->xdp_tx_errors++;
	else if (xdp_act == XDP_REDIRECT)
		rx->xdp_redirect_errors++;
	u64_stats_update_end(&rx->statss);
	gve_free_buffer(rx, buf_state);
}

static int gve_rx_xsk_dqo(struct napi_struct *napi, struct gve_rx_ring *rx,
			  struct gve_rx_buf_state_dqo *buf_state, int buf_len,
			  struct bpf_prog *xprog)
{
	struct xdp_buff *xdp = buf_state->xsk_buff;
	struct gve_priv *priv = rx->gve;
	int xdp_act;

	xdp->data_end = xdp->data + buf_len;
	xsk_buff_dma_sync_for_cpu(xdp);

	if (xprog) {
		xdp_act = bpf_prog_run_xdp(xprog, xdp);
		buf_len = xdp->data_end - xdp->data;
		if (xdp_act != XDP_PASS) {
			gve_xsk_done_dqo(priv, rx, xdp, xprog, xdp_act);
			gve_free_buf_state(rx, buf_state);
			return 0;
		}
	}

	/* Copy the data to skb */
	rx->ctx.skb_head = gve_rx_copy_data(priv->dev, napi,
					    xdp->data, buf_len);
	if (unlikely(!rx->ctx.skb_head)) {
		xsk_buff_free(xdp);
		gve_free_buf_state(rx, buf_state);
		return -ENOMEM;
	}
	rx->ctx.skb_tail = rx->ctx.skb_head;

	/* Free XSK buffer and Buffer state */
	xsk_buff_free(xdp);
	gve_free_buf_state(rx, buf_state);

	/* Update Stats */
	u64_stats_update_begin(&rx->statss);
	rx->xdp_actions[XDP_PASS]++;
	u64_stats_update_end(&rx->statss);
	return 0;
}

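/* Sync the received bytes for CPU access, either through the page pool helper
 * (raw addressing) or with a single-range DMA sync of the QPL page slice.
 */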
static void gve_dma_sync(struct gve_priv *priv, struct gve_rx_ring *rx,
			 struct gve_rx_buf_state_dqo *buf_state, u16 buf_len)
{
	struct gve_rx_slot_page_info *page_info = &buf_state->page_info;

	if (rx->dqo.page_pool) {
		page_pool_dma_sync_netmem_for_cpu(rx->dqo.page_pool,
						  page_info->netmem,
						  page_info->page_offset,
						  buf_len);
	} else {
		dma_sync_single_range_for_cpu(&priv->pdev->dev, buf_state->addr,
					      page_info->page_offset +
					      page_info->pad,
					      buf_len, DMA_FROM_DEVICE);
	}
}

/* Returns 0 if descriptor is completed successfully.
 * Returns -EINVAL if descriptor is invalid.
 * Returns -ENOMEM if data cannot be copied to skb.
 */
static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx,
		      const struct gve_rx_compl_desc_dqo *compl_desc,
		      u32 desc_idx, int queue_idx)
{
	const u16 buffer_id = le16_to_cpu(compl_desc->buf_id);
	const bool hbo = compl_desc->header_buffer_overflow;
	const bool eop = compl_desc->end_of_packet != 0;
	const bool hsplit = compl_desc->split_header;
	struct gve_rx_buf_state_dqo *buf_state;
	struct gve_priv *priv = rx->gve;
	struct bpf_prog *xprog;
	u16 buf_len;
	u16 hdr_len;

	if (unlikely(buffer_id >= rx->dqo.num_buf_states)) {
		net_err_ratelimited("%s: Invalid RX buffer_id=%u\n",
				    priv->dev->name, buffer_id);
		return -EINVAL;
	}
	buf_state = &rx->dqo.buf_states[buffer_id];
	if (unlikely(!gve_buf_state_is_allocated(rx, buf_state))) {
		net_err_ratelimited("%s: RX buffer_id is not allocated: %u\n",
				    priv->dev->name, buffer_id);
		return -EINVAL;
	}

	if (unlikely(compl_desc->rx_error)) {
		gve_free_buffer(rx, buf_state);
		return -EINVAL;
	}

	buf_len = compl_desc->packet_len;
	hdr_len = compl_desc->header_len;

	xprog = READ_ONCE(priv->xdp_prog);
	if (buf_state->xsk_buff)
		return gve_rx_xsk_dqo(napi, rx, buf_state, buf_len, xprog);

	/* The page might not have been used for a while and was likely last
	 * written by a different thread.
	 */
	if (rx->dqo.page_pool) {
		if (!netmem_is_net_iov(buf_state->page_info.netmem))
			prefetch(netmem_to_page(buf_state->page_info.netmem));
	} else {
		prefetch(buf_state->page_info.page);
	}

	/* Copy the header into the skb in the case of header split */
	if (hsplit) {
		int unsplit = 0;

		if (hdr_len && !hbo) {
			rx->ctx.skb_head = gve_rx_copy_data(priv->dev, napi,
							    rx->dqo.hdr_bufs.data +
							    desc_idx * priv->header_buf_size,
							    hdr_len);
			if (unlikely(!rx->ctx.skb_head))
				goto error;
			rx->ctx.skb_tail = rx->ctx.skb_head;

			if (rx->dqo.page_pool)
				skb_mark_for_recycle(rx->ctx.skb_head);
		} else {
			unsplit = 1;
		}
		u64_stats_update_begin(&rx->statss);
		rx->rx_hsplit_pkt++;
		rx->rx_hsplit_unsplit_pkt += unsplit;
		rx->rx_hsplit_bytes += hdr_len;
		u64_stats_update_end(&rx->statss);
	} else if (!rx->ctx.skb_head && rx->dqo.page_pool &&
		   netmem_is_net_iov(buf_state->page_info.netmem)) {
		/* When header split is disabled, the header goes into the
		 * packet buffer. If the packet buffer is a net_iov, it cannot
		 * easily be mapped into kernel space to access the header
		 * needed to process the packet.
		 */
		goto error;
	}

	/* Sync the portion of dma buffer for CPU to read. */
	gve_dma_sync(priv, rx, buf_state, buf_len);

	/* Append to current skb if one exists. */
	if (rx->ctx.skb_head) {
		if (unlikely(gve_rx_append_frags(napi, buf_state, buf_len, rx,
						 priv)) != 0) {
			goto error;
		}
		return 0;
	}

	if (xprog) {
		struct xdp_buff xdp;
		void *old_data;
		int xdp_act;

		xdp_init_buff(&xdp, buf_state->page_info.buf_size,
			      &rx->xdp_rxq);
		xdp_prepare_buff(&xdp,
				 buf_state->page_info.page_address +
				 buf_state->page_info.page_offset,
				 buf_state->page_info.pad,
				 buf_len, false);
		old_data = xdp.data;
		xdp_act = bpf_prog_run_xdp(xprog, &xdp);
		buf_state->page_info.pad += xdp.data - old_data;
		buf_len = xdp.data_end - xdp.data;
		if (xdp_act != XDP_PASS) {
			gve_xdp_done_dqo(priv, rx, &xdp, xprog, xdp_act,
					 buf_state);
			return 0;
		}

		u64_stats_update_begin(&rx->statss);
		rx->xdp_actions[XDP_PASS]++;
		u64_stats_update_end(&rx->statss);
	}

	if (eop && buf_len <= priv->rx_copybreak &&
	    !(rx->dqo.page_pool &&
	      netmem_is_net_iov(buf_state->page_info.netmem))) {
		rx->ctx.skb_head = gve_rx_copy(priv->dev, napi,
					       &buf_state->page_info, buf_len);
		if (unlikely(!rx->ctx.skb_head))
			goto error;
		rx->ctx.skb_tail = rx->ctx.skb_head;

		u64_stats_update_begin(&rx->statss);
		rx->rx_copied_pkt++;
		rx->rx_copybreak_pkt++;
		u64_stats_update_end(&rx->statss);

		gve_free_buffer(rx, buf_state);
		return 0;
	}

	rx->ctx.skb_head = napi_get_frags(napi);
	if (unlikely(!rx->ctx.skb_head))
		goto error;
	rx->ctx.skb_tail = rx->ctx.skb_head;

	if (gve_rx_should_trigger_copy_ondemand(rx)) {
		if (gve_rx_copy_ondemand(rx, buf_state, buf_len) < 0)
			goto error;
		return 0;
	}

	if (rx->dqo.page_pool)
		skb_mark_for_recycle(rx->ctx.skb_head);

	gve_skb_add_rx_frag(rx, buf_state, 0, buf_len);
	gve_reuse_buffer(rx, buf_state);
	return 0;

error:
	gve_free_buffer(rx, buf_state);
	return -ENOMEM;
}

static int gve_rx_complete_rsc(struct sk_buff *skb,
			       const struct gve_rx_compl_desc_dqo *desc,
			       struct gve_ptype ptype)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);

	/* Only TCP is supported right now. */
	if (ptype.l4_type != GVE_L4_TYPE_TCP)
		return -EINVAL;

	switch (ptype.l3_type) {
	case GVE_L3_TYPE_IPV4:
		shinfo->gso_type = SKB_GSO_TCPV4;
		break;
	case GVE_L3_TYPE_IPV6:
		shinfo->gso_type = SKB_GSO_TCPV6;
		break;
	default:
		return -EINVAL;
	}

	shinfo->gso_size = le16_to_cpu(desc->rsc_seg_len);
	return 0;
}

/* Returns 0 if skb is completed successfully, -1 otherwise. */
static int gve_rx_complete_skb(struct gve_rx_ring *rx, struct napi_struct *napi,
			       const struct gve_rx_compl_desc_dqo *desc,
			       netdev_features_t feat)
{
	struct gve_ptype ptype =
		rx->gve->ptype_lut_dqo->ptypes[desc->packet_type];
	int err;

	skb_record_rx_queue(rx->ctx.skb_head, rx->q_num);

	if (feat & NETIF_F_RXHASH)
		gve_rx_skb_hash(rx->ctx.skb_head, desc, ptype);

	if (feat & NETIF_F_RXCSUM)
		gve_rx_skb_csum(rx->ctx.skb_head, desc, ptype);

	if (rx->gve->ts_config.rx_filter == HWTSTAMP_FILTER_ALL)
		gve_rx_skb_hwtstamp(rx, le32_to_cpu(desc->ts));

	/* RSC packets must set gso_size otherwise the TCP stack will complain
	 * that packets are larger than MTU.
	 */
	if (desc->rsc) {
		err = gve_rx_complete_rsc(rx->ctx.skb_head, desc, ptype);
		if (err < 0)
			return err;
	}

	if (skb_headlen(rx->ctx.skb_head) == 0)
		napi_gro_frags(napi);
	else
		napi_gro_receive(napi, rx->ctx.skb_head);

	return 0;
}

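/* NAPI poll for a DQO RX ring: walk completion descriptors until the
 * generation bit flips or the budget is exhausted, build and complete skbs,
 * flush any pending XDP TX/redirects, and repost buffers to the device.
 * Returns the number of end-of-packet completions processed, which is
 * counted against the NAPI budget.
 */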
int gve_rx_poll_dqo(struct gve_notify_block *block, int budget)
{
	struct gve_rx_compl_queue_dqo *complq;
	struct napi_struct *napi;
	netdev_features_t feat;
	struct gve_rx_ring *rx;
	struct gve_priv *priv;
	u64 xdp_redirects;
	u32 work_done = 0;
	u64 bytes = 0;
	u64 xdp_txs;
	int err;

	napi = &block->napi;
	feat = napi->dev->features;

	rx = block->rx;
	priv = rx->gve;
	complq = &rx->dqo.complq;

	xdp_redirects = rx->xdp_actions[XDP_REDIRECT];
	xdp_txs = rx->xdp_actions[XDP_TX];

	while (work_done < budget) {
		struct gve_rx_compl_desc_dqo *compl_desc =
			&complq->desc_ring[complq->head];
		u32 pkt_bytes;

		/* No more new packets */
		if (compl_desc->generation == complq->cur_gen_bit)
			break;

		/* Prefetch the next two descriptors. */
		prefetch(&complq->desc_ring[(complq->head + 1) & complq->mask]);
		prefetch(&complq->desc_ring[(complq->head + 2) & complq->mask]);

		/* Do not read data until we own the descriptor */
		dma_rmb();

		err = gve_rx_dqo(napi, rx, compl_desc, complq->head, rx->q_num);
		if (err < 0) {
			gve_rx_free_skb(napi, rx);
			u64_stats_update_begin(&rx->statss);
			if (err == -ENOMEM)
				rx->rx_skb_alloc_fail++;
			else if (err == -EINVAL)
				rx->rx_desc_err_dropped_pkt++;
			u64_stats_update_end(&rx->statss);
		}

		complq->head = (complq->head + 1) & complq->mask;
		complq->num_free_slots++;

		/* When the ring wraps, the generation bit is flipped. */
		complq->cur_gen_bit ^= (complq->head == 0);

		/* Receiving a completion means we have space to post another
		 * buffer on the buffer queue.
		 */
		{
			struct gve_rx_buf_queue_dqo *bufq = &rx->dqo.bufq;

			bufq->head = (bufq->head + 1) & bufq->mask;
		}

		/* Free running counter of completed descriptors */
		rx->cnt++;

		if (!rx->ctx.skb_head)
			continue;

		if (!compl_desc->end_of_packet)
			continue;

		work_done++;
		pkt_bytes = rx->ctx.skb_head->len;
		/* The ethernet header (first ETH_HLEN bytes) is snipped off
		 * by eth_type_trans.
		 */
		if (skb_headlen(rx->ctx.skb_head))
			pkt_bytes += ETH_HLEN;

		/* gve_rx_complete_skb() will consume skb if successful */
		if (gve_rx_complete_skb(rx, napi, compl_desc, feat) != 0) {
			gve_rx_free_skb(napi, rx);
			u64_stats_update_begin(&rx->statss);
			rx->rx_desc_err_dropped_pkt++;
			u64_stats_update_end(&rx->statss);
			continue;
		}

		bytes += pkt_bytes;
		rx->ctx.skb_head = NULL;
		rx->ctx.skb_tail = NULL;
	}

	if (xdp_txs != rx->xdp_actions[XDP_TX])
		gve_xdp_tx_flush_dqo(priv, rx->q_num);

	if (xdp_redirects != rx->xdp_actions[XDP_REDIRECT])
		xdp_do_flush();

	gve_rx_post_buffers_dqo(rx);

	u64_stats_update_begin(&rx->statss);
	rx->rpackets += work_done;
	rx->rbytes += bytes;
	u64_stats_update_end(&rx->statss);

	return work_done;
}