// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2021 Google, Inc.
 */

#include "gve.h"
#include "gve_dqo.h"
#include "gve_adminq.h"
#include "gve_utils.h"
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <net/ip6_checksum.h>
#include <net/ipv6.h>
#include <net/tcp.h>

static int gve_buf_ref_cnt(struct gve_rx_buf_state_dqo *bs)
{
	return page_count(bs->page_info.page) - bs->page_info.pagecnt_bias;
}
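
/* Note on buffer reference counting: each RX page gets its page refcount
 * inflated to INT_MAX with page_info.pagecnt_bias set to the same value
 * (see gve_alloc_page_dqo() below), so the helper above reports how many
 * references are still held outside the driver.  For example, a freshly
 * set up page has page_count() == INT_MAX and pagecnt_bias == INT_MAX, so
 * gve_buf_ref_cnt() == 0; every fragment handed to the stack lowers the
 * bias by one (gve_dec_pagecnt_bias()), and the count drops back toward
 * zero as the stack releases the corresponding page references.
 */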

static void gve_free_page_dqo(struct gve_priv *priv,
			      struct gve_rx_buf_state_dqo *bs,
			      bool free_page)
{
	page_ref_sub(bs->page_info.page, bs->page_info.pagecnt_bias - 1);
	if (free_page)
		gve_free_page(&priv->pdev->dev, bs->page_info.page, bs->addr,
			      DMA_FROM_DEVICE);
	bs->page_info.page = NULL;
}

static struct gve_rx_buf_state_dqo *gve_alloc_buf_state(struct gve_rx_ring *rx)
{
	struct gve_rx_buf_state_dqo *buf_state;
	s16 buffer_id;

	buffer_id = rx->dqo.free_buf_states;
	if (unlikely(buffer_id == -1))
		return NULL;

	buf_state = &rx->dqo.buf_states[buffer_id];

	/* Remove buf_state from free list */
	rx->dqo.free_buf_states = buf_state->next;

	/* Point buf_state to itself to mark it as allocated */
	buf_state->next = buffer_id;

	return buf_state;
}

static bool gve_buf_state_is_allocated(struct gve_rx_ring *rx,
				       struct gve_rx_buf_state_dqo *buf_state)
{
	s16 buffer_id = buf_state - rx->dqo.buf_states;

	return buf_state->next == buffer_id;
}

static void gve_free_buf_state(struct gve_rx_ring *rx,
			       struct gve_rx_buf_state_dqo *buf_state)
{
	s16 buffer_id = buf_state - rx->dqo.buf_states;

	buf_state->next = rx->dqo.free_buf_states;
	rx->dqo.free_buf_states = buffer_id;
}

static struct gve_rx_buf_state_dqo *
gve_dequeue_buf_state(struct gve_rx_ring *rx, struct gve_index_list *list)
{
	struct gve_rx_buf_state_dqo *buf_state;
	s16 buffer_id;

	buffer_id = list->head;
	if (unlikely(buffer_id == -1))
		return NULL;

	buf_state = &rx->dqo.buf_states[buffer_id];

	/* Remove buf_state from list */
	list->head = buf_state->next;
	if (buf_state->next == -1)
		list->tail = -1;

	/* Point buf_state to itself to mark it as allocated */
	buf_state->next = buffer_id;

	return buf_state;
}

static void gve_enqueue_buf_state(struct gve_rx_ring *rx,
				  struct gve_index_list *list,
				  struct gve_rx_buf_state_dqo *buf_state)
{
	s16 buffer_id = buf_state - rx->dqo.buf_states;

	buf_state->next = -1;

	if (list->head == -1) {
		list->head = buffer_id;
		list->tail = buffer_id;
	} else {
		int tail = list->tail;

		rx->dqo.buf_states[tail].next = buffer_id;
		list->tail = buffer_id;
	}
}
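
/* The buffer state lists above use a small intrusive-list convention:
 * head, tail and next are s16 indexes into rx->dqo.buf_states, -1 means
 * "empty" or "end of list", and a buf_state whose next field points at
 * its own index is currently allocated (on neither the free list nor one
 * of the index lists).  A minimal walk of such a list, under those
 * assumptions, would look like:
 *
 *	for (id = list->head; id != -1; id = rx->dqo.buf_states[id].next)
 *		...;
 */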

static struct gve_rx_buf_state_dqo *
gve_get_recycled_buf_state(struct gve_rx_ring *rx)
{
	struct gve_rx_buf_state_dqo *buf_state;
	int i;

	/* Recycled buf states are immediately usable. */
	buf_state = gve_dequeue_buf_state(rx, &rx->dqo.recycled_buf_states);
	if (likely(buf_state))
		return buf_state;

	if (unlikely(rx->dqo.used_buf_states.head == -1))
		return NULL;

	/* Used buf states are only usable when their ref count reaches 0,
	 * which means no SKBs refer to them.
	 *
	 * Search a limited number of entries before giving up.
	 */
	for (i = 0; i < 5; i++) {
		buf_state = gve_dequeue_buf_state(rx, &rx->dqo.used_buf_states);
		if (gve_buf_ref_cnt(buf_state) == 0) {
			rx->dqo.used_buf_states_cnt--;
			return buf_state;
		}

		gve_enqueue_buf_state(rx, &rx->dqo.used_buf_states, buf_state);
	}

	/* For QPL, we cannot allocate any new buffers and must
	 * wait for the existing ones to become available.
	 */
	if (rx->dqo.qpl)
		return NULL;

	/* If there are no free buf states, discard an entry from
	 * `used_buf_states` so it can be used.
	 */
	if (unlikely(rx->dqo.free_buf_states == -1)) {
		buf_state = gve_dequeue_buf_state(rx, &rx->dqo.used_buf_states);
		if (gve_buf_ref_cnt(buf_state) == 0)
			return buf_state;

		gve_free_page_dqo(rx->gve, buf_state, true);
		gve_free_buf_state(rx, buf_state);
	}

	return NULL;
}
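
/* Recycling order used above: recycled_buf_states entries are reusable
 * right away, while used_buf_states entries are only reusable once no skb
 * still references their page, so only a bounded number of them are
 * probed per call.  In QPL mode no new pages can be created, so the
 * function gives up and waits for in-flight buffers to drain; in
 * raw-addressing mode, when the free list is also empty, one used entry
 * is pulled off: it is reused if its page is idle, otherwise its page is
 * released so a replacement page can be allocated instead.
 */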

static int gve_alloc_page_dqo(struct gve_rx_ring *rx,
			      struct gve_rx_buf_state_dqo *buf_state)
{
	struct gve_priv *priv = rx->gve;
	u32 idx;

	if (!rx->dqo.qpl) {
		int err;

		err = gve_alloc_page(priv, &priv->pdev->dev,
				     &buf_state->page_info.page,
				     &buf_state->addr,
				     DMA_FROM_DEVICE, GFP_ATOMIC);
		if (err)
			return err;
	} else {
		idx = rx->dqo.next_qpl_page_idx;
		if (idx >= gve_get_rx_pages_per_qpl_dqo(priv->rx_desc_cnt)) {
			net_err_ratelimited("%s: Out of QPL pages\n",
					    priv->dev->name);
			return -ENOMEM;
		}
		buf_state->page_info.page = rx->dqo.qpl->pages[idx];
		buf_state->addr = rx->dqo.qpl->page_buses[idx];
		rx->dqo.next_qpl_page_idx++;
	}
	buf_state->page_info.page_offset = 0;
	buf_state->page_info.page_address =
		page_address(buf_state->page_info.page);
	buf_state->last_single_ref_offset = 0;

	/* The page already has 1 ref. */
	page_ref_add(buf_state->page_info.page, INT_MAX - 1);
	buf_state->page_info.pagecnt_bias = INT_MAX;

	return 0;
}
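
/* Buffer pages come from two sources: in raw-addressing mode a fresh page
 * is allocated and DMA-mapped per buffer state, while in QPL mode pages
 * are handed out sequentially from the pre-registered queue page list
 * until it is exhausted (after that, posting relies entirely on recycled
 * buffer states).  Both paths end with the same pagecnt_bias setup
 * described above gve_buf_ref_cnt().
 */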

static void gve_rx_free_hdr_bufs(struct gve_priv *priv, struct gve_rx_ring *rx)
{
	struct device *hdev = &priv->pdev->dev;
	int buf_count = rx->dqo.bufq.mask + 1;

	if (rx->dqo.hdr_bufs.data) {
		dma_free_coherent(hdev, priv->header_buf_size * buf_count,
				  rx->dqo.hdr_bufs.data, rx->dqo.hdr_bufs.addr);
		rx->dqo.hdr_bufs.data = NULL;
	}
}

void gve_rx_stop_ring_dqo(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);

	if (!gve_rx_was_added_to_block(priv, idx))
		return;

	gve_remove_napi(priv, ntfy_idx);
	gve_rx_remove_from_block(priv, idx);
}

static void gve_rx_free_ring_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
				 struct gve_rx_alloc_rings_cfg *cfg)
{
	struct device *hdev = &priv->pdev->dev;
	size_t completion_queue_slots;
	size_t buffer_queue_slots;
	int idx = rx->q_num;
	size_t size;
	int i;

	completion_queue_slots = rx->dqo.complq.mask + 1;
	buffer_queue_slots = rx->dqo.bufq.mask + 1;

	if (rx->q_resources) {
		dma_free_coherent(hdev, sizeof(*rx->q_resources),
				  rx->q_resources, rx->q_resources_bus);
		rx->q_resources = NULL;
	}

	for (i = 0; i < rx->dqo.num_buf_states; i++) {
		struct gve_rx_buf_state_dqo *bs = &rx->dqo.buf_states[i];
		/* Only free the page for RDA. QPL pages are freed in gve_main. */
		if (bs->page_info.page)
			gve_free_page_dqo(priv, bs, !rx->dqo.qpl);
	}
	if (rx->dqo.qpl) {
		gve_unassign_qpl(cfg->qpl_cfg, rx->dqo.qpl->id);
		rx->dqo.qpl = NULL;
	}

	if (rx->dqo.bufq.desc_ring) {
		size = sizeof(rx->dqo.bufq.desc_ring[0]) * buffer_queue_slots;
		dma_free_coherent(hdev, size, rx->dqo.bufq.desc_ring,
				  rx->dqo.bufq.bus);
		rx->dqo.bufq.desc_ring = NULL;
	}

	if (rx->dqo.complq.desc_ring) {
		size = sizeof(rx->dqo.complq.desc_ring[0]) *
			completion_queue_slots;
		dma_free_coherent(hdev, size, rx->dqo.complq.desc_ring,
				  rx->dqo.complq.bus);
		rx->dqo.complq.desc_ring = NULL;
	}

	kvfree(rx->dqo.buf_states);
	rx->dqo.buf_states = NULL;

	gve_rx_free_hdr_bufs(priv, rx);

	netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx);
}

static int gve_rx_alloc_hdr_bufs(struct gve_priv *priv, struct gve_rx_ring *rx)
{
	struct device *hdev = &priv->pdev->dev;
	int buf_count = rx->dqo.bufq.mask + 1;

	rx->dqo.hdr_bufs.data = dma_alloc_coherent(hdev, priv->header_buf_size * buf_count,
						   &rx->dqo.hdr_bufs.addr, GFP_KERNEL);
	if (!rx->dqo.hdr_bufs.data)
		return -ENOMEM;

	return 0;
}

void gve_rx_start_ring_dqo(struct gve_priv *priv, int idx)
{
	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);

	gve_rx_add_to_block(priv, idx);
	gve_add_napi(priv, ntfy_idx, gve_napi_poll_dqo);
}

static int gve_rx_alloc_ring_dqo(struct gve_priv *priv,
				 struct gve_rx_alloc_rings_cfg *cfg,
				 struct gve_rx_ring *rx,
				 int idx)
{
	struct device *hdev = &priv->pdev->dev;
	size_t size;
	int i;

	const u32 buffer_queue_slots = cfg->ring_size;
	const u32 completion_queue_slots = cfg->ring_size;

	netif_dbg(priv, drv, priv->dev, "allocating rx ring DQO\n");

	memset(rx, 0, sizeof(*rx));
	rx->gve = priv;
	rx->q_num = idx;
	rx->dqo.bufq.mask = buffer_queue_slots - 1;
	rx->dqo.complq.num_free_slots = completion_queue_slots;
	rx->dqo.complq.mask = completion_queue_slots - 1;
	rx->ctx.skb_head = NULL;
	rx->ctx.skb_tail = NULL;

	rx->dqo.num_buf_states = cfg->raw_addressing ?
		min_t(s16, S16_MAX, buffer_queue_slots * 4) :
		gve_get_rx_pages_per_qpl_dqo(cfg->ring_size);
	rx->dqo.buf_states = kvcalloc(rx->dqo.num_buf_states,
				      sizeof(rx->dqo.buf_states[0]),
				      GFP_KERNEL);
	if (!rx->dqo.buf_states)
		return -ENOMEM;

	/* Allocate header buffers for header-split */
	if (cfg->enable_header_split)
		if (gve_rx_alloc_hdr_bufs(priv, rx))
			goto err;

	/* Set up linked list of buffer IDs */
	for (i = 0; i < rx->dqo.num_buf_states - 1; i++)
		rx->dqo.buf_states[i].next = i + 1;

	rx->dqo.buf_states[rx->dqo.num_buf_states - 1].next = -1;
	rx->dqo.recycled_buf_states.head = -1;
	rx->dqo.recycled_buf_states.tail = -1;
	rx->dqo.used_buf_states.head = -1;
	rx->dqo.used_buf_states.tail = -1;

	/* Allocate RX completion queue */
	size = sizeof(rx->dqo.complq.desc_ring[0]) *
		completion_queue_slots;
	rx->dqo.complq.desc_ring =
		dma_alloc_coherent(hdev, size, &rx->dqo.complq.bus, GFP_KERNEL);
	if (!rx->dqo.complq.desc_ring)
		goto err;

	/* Allocate RX buffer queue */
	size = sizeof(rx->dqo.bufq.desc_ring[0]) * buffer_queue_slots;
	rx->dqo.bufq.desc_ring =
		dma_alloc_coherent(hdev, size, &rx->dqo.bufq.bus, GFP_KERNEL);
	if (!rx->dqo.bufq.desc_ring)
		goto err;

	if (!cfg->raw_addressing) {
		rx->dqo.qpl = gve_assign_rx_qpl(cfg, rx->q_num);
		if (!rx->dqo.qpl)
			goto err;
		rx->dqo.next_qpl_page_idx = 0;
	}

	rx->q_resources = dma_alloc_coherent(hdev, sizeof(*rx->q_resources),
					     &rx->q_resources_bus, GFP_KERNEL);
	if (!rx->q_resources)
		goto err;

	return 0;

err:
	gve_rx_free_ring_dqo(priv, rx, cfg);
	return -ENOMEM;
}
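
/* Sizing note for the allocation above: in raw-addressing mode the ring
 * keeps up to four buffer states per buffer-queue slot (capped at S16_MAX
 * because buffer IDs are s16), e.g. a hypothetical 1024-slot ring would
 * get 4096 buffer states; in QPL mode there is exactly one buffer state
 * per QPL page.
 */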

void gve_rx_write_doorbell_dqo(const struct gve_priv *priv, int queue_idx)
{
	const struct gve_rx_ring *rx = &priv->rx[queue_idx];
	u64 index = be32_to_cpu(rx->q_resources->db_index);

	iowrite32(rx->dqo.bufq.tail, &priv->db_bar2[index]);
}

int gve_rx_alloc_rings_dqo(struct gve_priv *priv,
			   struct gve_rx_alloc_rings_cfg *cfg)
{
	struct gve_rx_ring *rx;
	int err;
	int i;

	if (!cfg->raw_addressing && !cfg->qpls) {
		netif_err(priv, drv, priv->dev,
			  "Cannot alloc QPL ring before allocing QPLs\n");
		return -EINVAL;
	}

	rx = kvcalloc(cfg->qcfg->max_queues, sizeof(struct gve_rx_ring),
		      GFP_KERNEL);
	if (!rx)
		return -ENOMEM;

	for (i = 0; i < cfg->qcfg->num_queues; i++) {
		err = gve_rx_alloc_ring_dqo(priv, cfg, &rx[i], i);
		if (err) {
			netif_err(priv, drv, priv->dev,
				  "Failed to alloc rx ring=%d: err=%d\n",
				  i, err);
			goto err;
		}
	}

	cfg->rx = rx;
	return 0;

err:
	for (i--; i >= 0; i--)
		gve_rx_free_ring_dqo(priv, &rx[i], cfg);
	kvfree(rx);
	return err;
}

void gve_rx_free_rings_dqo(struct gve_priv *priv,
			   struct gve_rx_alloc_rings_cfg *cfg)
{
	struct gve_rx_ring *rx = cfg->rx;
	int i;

	if (!rx)
		return;

	for (i = 0; i < cfg->qcfg->num_queues; i++)
		gve_rx_free_ring_dqo(priv, &rx[i], cfg);

	kvfree(rx);
	cfg->rx = NULL;
}

void gve_rx_post_buffers_dqo(struct gve_rx_ring *rx)
{
	struct gve_rx_compl_queue_dqo *complq = &rx->dqo.complq;
	struct gve_rx_buf_queue_dqo *bufq = &rx->dqo.bufq;
	struct gve_priv *priv = rx->gve;
	u32 num_avail_slots;
	u32 num_full_slots;
	u32 num_posted = 0;

	num_full_slots = (bufq->tail - bufq->head) & bufq->mask;
	num_avail_slots = bufq->mask - num_full_slots;

	num_avail_slots = min_t(u32, num_avail_slots, complq->num_free_slots);
	while (num_posted < num_avail_slots) {
		struct gve_rx_desc_dqo *desc = &bufq->desc_ring[bufq->tail];
		struct gve_rx_buf_state_dqo *buf_state;

		buf_state = gve_get_recycled_buf_state(rx);
		if (unlikely(!buf_state)) {
			buf_state = gve_alloc_buf_state(rx);
			if (unlikely(!buf_state))
				break;

			if (unlikely(gve_alloc_page_dqo(rx, buf_state))) {
				u64_stats_update_begin(&rx->statss);
				rx->rx_buf_alloc_fail++;
				u64_stats_update_end(&rx->statss);
				gve_free_buf_state(rx, buf_state);
				break;
			}
		}

		desc->buf_id = cpu_to_le16(buf_state - rx->dqo.buf_states);
		desc->buf_addr = cpu_to_le64(buf_state->addr +
					     buf_state->page_info.page_offset);
		if (rx->dqo.hdr_bufs.data)
			desc->header_buf_addr =
				cpu_to_le64(rx->dqo.hdr_bufs.addr +
					    priv->header_buf_size * bufq->tail);

		bufq->tail = (bufq->tail + 1) & bufq->mask;
		complq->num_free_slots--;
		num_posted++;

		if ((bufq->tail & (GVE_RX_BUF_THRESH_DQO - 1)) == 0)
			gve_rx_write_doorbell_dqo(priv, rx->q_num);
	}

	rx->fill_cnt += num_posted;
}
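
/* The buffer queue is a power-of-two ring, so occupancy above is pure
 * mask arithmetic: num_full_slots = (tail - head) & mask.  As an
 * illustration with a hypothetical mask == 255, tail == 5 and head ==
 * 250, (5 - 250) & 255 == 11 buffers are in flight.  Availability is
 * computed against mask rather than the ring size, so one slot always
 * stays empty, and the doorbell is only rung once every
 * GVE_RX_BUF_THRESH_DQO posted buffers to limit MMIO writes.
 */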

static void gve_try_recycle_buf(struct gve_priv *priv, struct gve_rx_ring *rx,
				struct gve_rx_buf_state_dqo *buf_state)
{
	const u16 data_buffer_size = priv->data_buffer_size_dqo;
	int pagecount;

	/* Can't reuse if we only fit one buffer per page */
	if (data_buffer_size * 2 > PAGE_SIZE)
		goto mark_used;

	pagecount = gve_buf_ref_cnt(buf_state);

	/* Record the offset when we have a single remaining reference.
	 *
	 * When this happens, we know all of the other offsets of the page are
	 * usable.
	 */
	if (pagecount == 1) {
		buf_state->last_single_ref_offset =
			buf_state->page_info.page_offset;
	}

	/* Use the next buffer-sized chunk in the page. */
	buf_state->page_info.page_offset += data_buffer_size;
	buf_state->page_info.page_offset &= (PAGE_SIZE - 1);

	/* If we wrap around to the same offset without ever dropping to 1
	 * reference, then we don't know if this offset was ever freed.
	 */
	if (buf_state->page_info.page_offset ==
	    buf_state->last_single_ref_offset) {
		goto mark_used;
	}

	gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state);
	return;

mark_used:
	gve_enqueue_buf_state(rx, &rx->dqo.used_buf_states, buf_state);
	rx->dqo.used_buf_states_cnt++;
}
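
/* Example of the recycling scheme above, assuming 4K pages and a 2K
 * data_buffer_size: page_offset alternates between 0 and 2048, and the
 * buffer keeps being recycled as long as the page drops to a single
 * remaining reference before the offset wraps back around to
 * last_single_ref_offset.  With buffers larger than half a page
 * (data_buffer_size * 2 > PAGE_SIZE) the page can never be shared, so it
 * goes straight to the used list.
 */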

static void gve_rx_skb_csum(struct sk_buff *skb,
			    const struct gve_rx_compl_desc_dqo *desc,
			    struct gve_ptype ptype)
{
	skb->ip_summed = CHECKSUM_NONE;

	/* HW did not identify and process L3 and L4 headers. */
	if (unlikely(!desc->l3_l4_processed))
		return;

	if (ptype.l3_type == GVE_L3_TYPE_IPV4) {
		if (unlikely(desc->csum_ip_err || desc->csum_external_ip_err))
			return;
	} else if (ptype.l3_type == GVE_L3_TYPE_IPV6) {
		/* Checksum should be skipped if this flag is set. */
		if (unlikely(desc->ipv6_ex_add))
			return;
	}

	if (unlikely(desc->csum_l4_err))
		return;

	switch (ptype.l4_type) {
	case GVE_L4_TYPE_TCP:
	case GVE_L4_TYPE_UDP:
	case GVE_L4_TYPE_ICMP:
	case GVE_L4_TYPE_SCTP:
		skb->ip_summed = CHECKSUM_UNNECESSARY;
		break;
	default:
		break;
	}
}

static void gve_rx_skb_hash(struct sk_buff *skb,
			    const struct gve_rx_compl_desc_dqo *compl_desc,
			    struct gve_ptype ptype)
{
	enum pkt_hash_types hash_type = PKT_HASH_TYPE_L2;

	if (ptype.l4_type != GVE_L4_TYPE_UNKNOWN)
		hash_type = PKT_HASH_TYPE_L4;
	else if (ptype.l3_type != GVE_L3_TYPE_UNKNOWN)
		hash_type = PKT_HASH_TYPE_L3;

	skb_set_hash(skb, le32_to_cpu(compl_desc->hash), hash_type);
}

static void gve_rx_free_skb(struct gve_rx_ring *rx)
{
	if (!rx->ctx.skb_head)
		return;

	dev_kfree_skb_any(rx->ctx.skb_head);
	rx->ctx.skb_head = NULL;
	rx->ctx.skb_tail = NULL;
}

static bool gve_rx_should_trigger_copy_ondemand(struct gve_rx_ring *rx)
{
	if (!rx->dqo.qpl)
		return false;
	if (rx->dqo.used_buf_states_cnt <
		     (rx->dqo.num_buf_states -
		     GVE_DQO_QPL_ONDEMAND_ALLOC_THRESHOLD))
		return false;
	return true;
}
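
/* Copy-on-demand only applies to QPL mode: once all but
 * GVE_DQO_QPL_ONDEMAND_ALLOC_THRESHOLD buffer states sit on the used
 * list, incoming payloads are copied into freshly allocated pages
 * (gve_rx_copy_ondemand()) instead of pinning more QPL pages, which
 * keeps the fixed page pool from running dry.
 */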

static int gve_rx_copy_ondemand(struct gve_rx_ring *rx,
				struct gve_rx_buf_state_dqo *buf_state,
				u16 buf_len)
{
	struct page *page = alloc_page(GFP_ATOMIC);
	int num_frags;

	if (!page)
		return -ENOMEM;

	memcpy(page_address(page),
	       buf_state->page_info.page_address +
	       buf_state->page_info.page_offset,
	       buf_len);
	num_frags = skb_shinfo(rx->ctx.skb_tail)->nr_frags;
	skb_add_rx_frag(rx->ctx.skb_tail, num_frags, page,
			0, buf_len, PAGE_SIZE);

	u64_stats_update_begin(&rx->statss);
	rx->rx_frag_alloc_cnt++;
	u64_stats_update_end(&rx->statss);
	/* Return the unused buffer. */
	gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state);
	return 0;
}

/* Chains multiple skbs for a single RX packet.
 * Returns 0 if the buffer is appended, -1 otherwise.
 */
static int gve_rx_append_frags(struct napi_struct *napi,
			       struct gve_rx_buf_state_dqo *buf_state,
			       u16 buf_len, struct gve_rx_ring *rx,
			       struct gve_priv *priv)
{
	int num_frags = skb_shinfo(rx->ctx.skb_tail)->nr_frags;

	if (unlikely(num_frags == MAX_SKB_FRAGS)) {
		struct sk_buff *skb;

		skb = napi_alloc_skb(napi, 0);
		if (!skb)
			return -1;

		if (rx->ctx.skb_tail == rx->ctx.skb_head)
			skb_shinfo(rx->ctx.skb_head)->frag_list = skb;
		else
			rx->ctx.skb_tail->next = skb;
		rx->ctx.skb_tail = skb;
		num_frags = 0;
	}
	if (rx->ctx.skb_tail != rx->ctx.skb_head) {
		rx->ctx.skb_head->len += buf_len;
		rx->ctx.skb_head->data_len += buf_len;
		rx->ctx.skb_head->truesize += priv->data_buffer_size_dqo;
	}

	/* Trigger on-demand page allocation if we are running low on buffers */
	if (gve_rx_should_trigger_copy_ondemand(rx))
		return gve_rx_copy_ondemand(rx, buf_state, buf_len);

	skb_add_rx_frag(rx->ctx.skb_tail, num_frags,
			buf_state->page_info.page,
			buf_state->page_info.page_offset,
			buf_len, priv->data_buffer_size_dqo);
	gve_dec_pagecnt_bias(&buf_state->page_info);

	/* Advances the buffer page offset if the page is partially used.
	 * Marks the buffer as used if the page is full.
	 */
	gve_try_recycle_buf(priv, rx, buf_state);
	return 0;
}
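
/* When the tail skb already carries MAX_SKB_FRAGS fragments, a fresh
 * zero-length skb is chained onto the head skb's frag_list and becomes
 * the new tail; len, data_len and truesize are accounted on the head skb
 * so the stack sees one logical packet.
 */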

/* Returns 0 if the descriptor is completed successfully.
 * Returns -EINVAL if the descriptor is invalid.
 * Returns -ENOMEM if data cannot be copied to the skb.
 */
static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx,
		      const struct gve_rx_compl_desc_dqo *compl_desc,
		      u32 desc_idx, int queue_idx)
{
	const u16 buffer_id = le16_to_cpu(compl_desc->buf_id);
	const bool hbo = compl_desc->header_buffer_overflow;
	const bool eop = compl_desc->end_of_packet != 0;
	const bool hsplit = compl_desc->split_header;
	struct gve_rx_buf_state_dqo *buf_state;
	struct gve_priv *priv = rx->gve;
	u16 buf_len;
	u16 hdr_len;

	if (unlikely(buffer_id >= rx->dqo.num_buf_states)) {
		net_err_ratelimited("%s: Invalid RX buffer_id=%u\n",
				    priv->dev->name, buffer_id);
		return -EINVAL;
	}
	buf_state = &rx->dqo.buf_states[buffer_id];
	if (unlikely(!gve_buf_state_is_allocated(rx, buf_state))) {
		net_err_ratelimited("%s: RX buffer_id is not allocated: %u\n",
				    priv->dev->name, buffer_id);
		return -EINVAL;
	}

	if (unlikely(compl_desc->rx_error)) {
		gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states,
				      buf_state);
		return -EINVAL;
	}

	buf_len = compl_desc->packet_len;
	hdr_len = compl_desc->header_len;

	/* The page might not have been used for a while and was likely last
	 * written by a different thread.
	 */
	prefetch(buf_state->page_info.page);

	/* Copy the header into the skb in the case of header split */
	if (hsplit) {
		int unsplit = 0;

		if (hdr_len && !hbo) {
			rx->ctx.skb_head = gve_rx_copy_data(priv->dev, napi,
							    rx->dqo.hdr_bufs.data +
							    desc_idx * priv->header_buf_size,
							    hdr_len);
			if (unlikely(!rx->ctx.skb_head))
				goto error;
			rx->ctx.skb_tail = rx->ctx.skb_head;
		} else {
			unsplit = 1;
		}
		u64_stats_update_begin(&rx->statss);
		rx->rx_hsplit_pkt++;
		rx->rx_hsplit_unsplit_pkt += unsplit;
		rx->rx_hsplit_bytes += hdr_len;
		u64_stats_update_end(&rx->statss);
	}

	/* Sync the portion of the DMA buffer the CPU is about to read. */
	dma_sync_single_range_for_cpu(&priv->pdev->dev, buf_state->addr,
				      buf_state->page_info.page_offset,
				      buf_len, DMA_FROM_DEVICE);

	/* Append to the current skb if one exists. */
	if (rx->ctx.skb_head) {
		if (unlikely(gve_rx_append_frags(napi, buf_state, buf_len, rx,
						 priv)) != 0) {
			goto error;
		}
		return 0;
	}

	if (eop && buf_len <= priv->rx_copybreak) {
		rx->ctx.skb_head = gve_rx_copy(priv->dev, napi,
					       &buf_state->page_info, buf_len);
		if (unlikely(!rx->ctx.skb_head))
			goto error;
		rx->ctx.skb_tail = rx->ctx.skb_head;

		u64_stats_update_begin(&rx->statss);
		rx->rx_copied_pkt++;
		rx->rx_copybreak_pkt++;
		u64_stats_update_end(&rx->statss);

		gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states,
				      buf_state);
		return 0;
	}

	rx->ctx.skb_head = napi_get_frags(napi);
	if (unlikely(!rx->ctx.skb_head))
		goto error;
	rx->ctx.skb_tail = rx->ctx.skb_head;

	if (gve_rx_should_trigger_copy_ondemand(rx)) {
		if (gve_rx_copy_ondemand(rx, buf_state, buf_len) < 0)
			goto error;
		return 0;
	}

	skb_add_rx_frag(rx->ctx.skb_head, 0, buf_state->page_info.page,
			buf_state->page_info.page_offset, buf_len,
			priv->data_buffer_size_dqo);
	gve_dec_pagecnt_bias(&buf_state->page_info);

	gve_try_recycle_buf(priv, rx, buf_state);
	return 0;

error:
	gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state);
	return -ENOMEM;
}
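
/* Receive path summary for gve_rx_dqo(): header-split packets have their
 * header copied out of the dedicated header buffer first; short packets
 * (up to rx_copybreak) are copied in full and their buffer recycled
 * immediately; everything else is attached zero-copy as page fragments,
 * with copy-on-demand as the fallback when QPL buffers run low.
 */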

static int gve_rx_complete_rsc(struct sk_buff *skb,
			       const struct gve_rx_compl_desc_dqo *desc,
			       struct gve_ptype ptype)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);

	/* Only TCP is supported right now. */
	if (ptype.l4_type != GVE_L4_TYPE_TCP)
		return -EINVAL;

	switch (ptype.l3_type) {
	case GVE_L3_TYPE_IPV4:
		shinfo->gso_type = SKB_GSO_TCPV4;
		break;
	case GVE_L3_TYPE_IPV6:
		shinfo->gso_type = SKB_GSO_TCPV6;
		break;
	default:
		return -EINVAL;
	}

	shinfo->gso_size = le16_to_cpu(desc->rsc_seg_len);
	return 0;
}

/* Returns 0 if the skb is completed successfully, -1 otherwise. */
static int gve_rx_complete_skb(struct gve_rx_ring *rx, struct napi_struct *napi,
			       const struct gve_rx_compl_desc_dqo *desc,
			       netdev_features_t feat)
{
	struct gve_ptype ptype =
		rx->gve->ptype_lut_dqo->ptypes[desc->packet_type];
	int err;

	skb_record_rx_queue(rx->ctx.skb_head, rx->q_num);

	if (feat & NETIF_F_RXHASH)
		gve_rx_skb_hash(rx->ctx.skb_head, desc, ptype);

	if (feat & NETIF_F_RXCSUM)
		gve_rx_skb_csum(rx->ctx.skb_head, desc, ptype);

	/* RSC packets must set gso_size, otherwise the TCP stack will complain
	 * that packets are larger than the MTU.
	 */
	if (desc->rsc) {
		err = gve_rx_complete_rsc(rx->ctx.skb_head, desc, ptype);
		if (err < 0)
			return err;
	}

	if (skb_headlen(rx->ctx.skb_head) == 0)
		napi_gro_frags(napi);
	else
		napi_gro_receive(napi, rx->ctx.skb_head);

	return 0;
}
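
/* Completed skbs are handed to GRO in one of two ways: skbs built from
 * napi_get_frags() carry no linear data and must go through
 * napi_gro_frags(), while copied skbs (copybreak or header-split heads)
 * have linear data and use napi_gro_receive().  skb_headlen()
 * distinguishes the two cases above.
 */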

int gve_rx_poll_dqo(struct gve_notify_block *block, int budget)
{
	struct napi_struct *napi = &block->napi;
	netdev_features_t feat = napi->dev->features;

	struct gve_rx_ring *rx = block->rx;
	struct gve_rx_compl_queue_dqo *complq = &rx->dqo.complq;

	u32 work_done = 0;
	u64 bytes = 0;
	int err;

	while (work_done < budget) {
		struct gve_rx_compl_desc_dqo *compl_desc =
			&complq->desc_ring[complq->head];
		u32 pkt_bytes;

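		/* Completion ownership is tracked with a generation bit:
		 * cur_gen_bit is flipped each time head wraps to 0 (further
		 * below), and an entry whose generation still equals
		 * cur_gen_bit is treated as not yet rewritten by the device
		 * for the current pass, so the loop stops at it.
		 */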
		/* No more new packets */
		if (compl_desc->generation == complq->cur_gen_bit)
			break;

		/* Prefetch the next two descriptors. */
		prefetch(&complq->desc_ring[(complq->head + 1) & complq->mask]);
		prefetch(&complq->desc_ring[(complq->head + 2) & complq->mask]);

		/* Do not read data until we own the descriptor */
		dma_rmb();

		err = gve_rx_dqo(napi, rx, compl_desc, complq->head, rx->q_num);
		if (err < 0) {
			gve_rx_free_skb(rx);
			u64_stats_update_begin(&rx->statss);
			if (err == -ENOMEM)
				rx->rx_skb_alloc_fail++;
			else if (err == -EINVAL)
				rx->rx_desc_err_dropped_pkt++;
			u64_stats_update_end(&rx->statss);
		}

		complq->head = (complq->head + 1) & complq->mask;
		complq->num_free_slots++;

		/* When the ring wraps, the generation bit is flipped. */
		complq->cur_gen_bit ^= (complq->head == 0);

		/* Receiving a completion means we have space to post another
		 * buffer on the buffer queue.
		 */
		{
			struct gve_rx_buf_queue_dqo *bufq = &rx->dqo.bufq;

			bufq->head = (bufq->head + 1) & bufq->mask;
		}

		/* Free-running counter of completed descriptors */
		rx->cnt++;

		if (!rx->ctx.skb_head)
			continue;

		if (!compl_desc->end_of_packet)
			continue;

		work_done++;
		pkt_bytes = rx->ctx.skb_head->len;
		/* The Ethernet header (first ETH_HLEN bytes) is snipped off
		 * by eth_type_trans().
		 */
		if (skb_headlen(rx->ctx.skb_head))
			pkt_bytes += ETH_HLEN;

		/* gve_rx_complete_skb() will consume the skb if successful */
		if (gve_rx_complete_skb(rx, napi, compl_desc, feat) != 0) {
			gve_rx_free_skb(rx);
			u64_stats_update_begin(&rx->statss);
			rx->rx_desc_err_dropped_pkt++;
			u64_stats_update_end(&rx->statss);
			continue;
		}

		bytes += pkt_bytes;
		rx->ctx.skb_head = NULL;
		rx->ctx.skb_tail = NULL;
	}

	gve_rx_post_buffers_dqo(rx);

	u64_stats_update_begin(&rx->statss);
	rx->rpackets += work_done;
	rx->rbytes += bytes;
	u64_stats_update_end(&rx->statss);

	return work_done;
}