xref: /linux/drivers/net/ethernet/google/gve/gve_rx_dqo.c (revision eb01fe7abbe2d0b38824d2a93fdb4cc3eaf2ccc1)
1 // SPDX-License-Identifier: (GPL-2.0 OR MIT)
2 /* Google virtual Ethernet (gve) driver
3  *
4  * Copyright (C) 2015-2021 Google, Inc.
5  */
6 
7 #include "gve.h"
8 #include "gve_dqo.h"
9 #include "gve_adminq.h"
10 #include "gve_utils.h"
11 #include <linux/ip.h>
12 #include <linux/ipv6.h>
13 #include <linux/skbuff.h>
14 #include <linux/slab.h>
15 #include <net/ip6_checksum.h>
16 #include <net/ipv6.h>
17 #include <net/tcp.h>
18 
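/* Number of references on the buffer's page that are held outside the
 * driver (e.g. by skbs): the raw page refcount minus the driver's
 * pagecnt bias.
 */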
19 static int gve_buf_ref_cnt(struct gve_rx_buf_state_dqo *bs)
20 {
21 	return page_count(bs->page_info.page) - bs->page_info.pagecnt_bias;
22 }
23 
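/* Drop the driver's remaining bias references on the page and, when
 * free_page is set, unmap and release the page itself.
 */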
24 static void gve_free_page_dqo(struct gve_priv *priv,
25 			      struct gve_rx_buf_state_dqo *bs,
26 			      bool free_page)
27 {
28 	page_ref_sub(bs->page_info.page, bs->page_info.pagecnt_bias - 1);
29 	if (free_page)
30 		gve_free_page(&priv->pdev->dev, bs->page_info.page, bs->addr,
31 			      DMA_FROM_DEVICE);
32 	bs->page_info.page = NULL;
33 }
34 
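/* Take a buf_state off the free list, or return NULL if none are available. */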
35 static struct gve_rx_buf_state_dqo *gve_alloc_buf_state(struct gve_rx_ring *rx)
36 {
37 	struct gve_rx_buf_state_dqo *buf_state;
38 	s16 buffer_id;
39 
40 	buffer_id = rx->dqo.free_buf_states;
41 	if (unlikely(buffer_id == -1))
42 		return NULL;
43 
44 	buf_state = &rx->dqo.buf_states[buffer_id];
45 
46 	/* Remove buf_state from free list */
47 	rx->dqo.free_buf_states = buf_state->next;
48 
49 	/* Point buf_state to itself to mark it as allocated */
50 	buf_state->next = buffer_id;
51 
52 	return buf_state;
53 }
54 
55 static bool gve_buf_state_is_allocated(struct gve_rx_ring *rx,
56 				       struct gve_rx_buf_state_dqo *buf_state)
57 {
58 	s16 buffer_id = buf_state - rx->dqo.buf_states;
59 
60 	return buf_state->next == buffer_id;
61 }
62 
63 static void gve_free_buf_state(struct gve_rx_ring *rx,
64 			       struct gve_rx_buf_state_dqo *buf_state)
65 {
66 	s16 buffer_id = buf_state - rx->dqo.buf_states;
67 
68 	buf_state->next = rx->dqo.free_buf_states;
69 	rx->dqo.free_buf_states = buffer_id;
70 }
71 
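/* Remove and return the buf_state at the head of the given list, or NULL if
 * the list is empty. The entry is marked allocated by pointing its next
 * index at itself.
 */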
72 static struct gve_rx_buf_state_dqo *
73 gve_dequeue_buf_state(struct gve_rx_ring *rx, struct gve_index_list *list)
74 {
75 	struct gve_rx_buf_state_dqo *buf_state;
76 	s16 buffer_id;
77 
78 	buffer_id = list->head;
79 	if (unlikely(buffer_id == -1))
80 		return NULL;
81 
82 	buf_state = &rx->dqo.buf_states[buffer_id];
83 
84 	/* Remove buf_state from list */
85 	list->head = buf_state->next;
86 	if (buf_state->next == -1)
87 		list->tail = -1;
88 
89 	/* Point buf_state to itself to mark it as allocated */
90 	buf_state->next = buffer_id;
91 
92 	return buf_state;
93 }
94 
95 static void gve_enqueue_buf_state(struct gve_rx_ring *rx,
96 				  struct gve_index_list *list,
97 				  struct gve_rx_buf_state_dqo *buf_state)
98 {
99 	s16 buffer_id = buf_state - rx->dqo.buf_states;
100 
101 	buf_state->next = -1;
102 
103 	if (list->head == -1) {
104 		list->head = buffer_id;
105 		list->tail = buffer_id;
106 	} else {
107 		int tail = list->tail;
108 
109 		rx->dqo.buf_states[tail].next = buffer_id;
110 		list->tail = buffer_id;
111 	}
112 }
113 
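/* Find a buf_state whose buffer can be posted again. Prefer explicitly
 * recycled entries, then scan a few used entries for pages the stack has
 * fully released. As a last resort for RDA, give up one in-flight page so
 * its buf_state can be paired with a fresh page by the caller.
 */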
114 static struct gve_rx_buf_state_dqo *
115 gve_get_recycled_buf_state(struct gve_rx_ring *rx)
116 {
117 	struct gve_rx_buf_state_dqo *buf_state;
118 	int i;
119 
120 	/* Recycled buf states are immediately usable. */
121 	buf_state = gve_dequeue_buf_state(rx, &rx->dqo.recycled_buf_states);
122 	if (likely(buf_state))
123 		return buf_state;
124 
125 	if (unlikely(rx->dqo.used_buf_states.head == -1))
126 		return NULL;
127 
128 	/* Used buf states are only usable when ref count reaches 0, which means
129 	 * no SKBs refer to them.
130 	 *
131 	 * Search a limited number before giving up.
132 	 */
133 	for (i = 0; i < 5; i++) {
134 		buf_state = gve_dequeue_buf_state(rx, &rx->dqo.used_buf_states);
135 		if (gve_buf_ref_cnt(buf_state) == 0) {
136 			rx->dqo.used_buf_states_cnt--;
137 			return buf_state;
138 		}
139 
140 		gve_enqueue_buf_state(rx, &rx->dqo.used_buf_states, buf_state);
141 	}
142 
143 	/* For QPL, we cannot allocate any new buffers and must
144 	 * wait for the existing ones to be available.
145 	 */
146 	if (rx->dqo.qpl)
147 		return NULL;
148 
149 	/* If there are no free buf states, discard an entry from
150 	 * `used_buf_states` so it can be used.
151 	 */
152 	if (unlikely(rx->dqo.free_buf_states == -1)) {
153 		buf_state = gve_dequeue_buf_state(rx, &rx->dqo.used_buf_states);
154 		if (gve_buf_ref_cnt(buf_state) == 0)
155 			return buf_state;
156 
157 		gve_free_page_dqo(rx->gve, buf_state, true);
158 		gve_free_buf_state(rx, buf_state);
159 	}
160 
161 	return NULL;
162 }
163 
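/* Attach a page to the buf_state: a freshly allocated and DMA-mapped page
 * for RDA, or the next unused page of the ring's QPL. A large pagecnt bias
 * is taken up front so per-fragment references can be handed out by
 * decrementing the bias instead of touching the page refcount each time.
 */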
164 static int gve_alloc_page_dqo(struct gve_rx_ring *rx,
165 			      struct gve_rx_buf_state_dqo *buf_state)
166 {
167 	struct gve_priv *priv = rx->gve;
168 	u32 idx;
169 
170 	if (!rx->dqo.qpl) {
171 		int err;
172 
173 		err = gve_alloc_page(priv, &priv->pdev->dev,
174 				     &buf_state->page_info.page,
175 				     &buf_state->addr,
176 				     DMA_FROM_DEVICE, GFP_ATOMIC);
177 		if (err)
178 			return err;
179 	} else {
180 		idx = rx->dqo.next_qpl_page_idx;
181 		if (idx >= priv->rx_pages_per_qpl) {
182 			net_err_ratelimited("%s: Out of QPL pages\n",
183 					    priv->dev->name);
184 			return -ENOMEM;
185 		}
186 		buf_state->page_info.page = rx->dqo.qpl->pages[idx];
187 		buf_state->addr = rx->dqo.qpl->page_buses[idx];
188 		rx->dqo.next_qpl_page_idx++;
189 	}
190 	buf_state->page_info.page_offset = 0;
191 	buf_state->page_info.page_address =
192 		page_address(buf_state->page_info.page);
193 	buf_state->last_single_ref_offset = 0;
194 
195 	/* The page already has 1 ref. */
196 	page_ref_add(buf_state->page_info.page, INT_MAX - 1);
197 	buf_state->page_info.pagecnt_bias = INT_MAX;
198 
199 	return 0;
200 }
201 
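/* Free the DMA-coherent block of header buffers used for header-split, if
 * one was allocated.
 */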
202 static void gve_rx_free_hdr_bufs(struct gve_priv *priv, struct gve_rx_ring *rx)
203 {
204 	struct device *hdev = &priv->pdev->dev;
205 	int buf_count = rx->dqo.bufq.mask + 1;
206 
207 	if (rx->dqo.hdr_bufs.data) {
208 		dma_free_coherent(hdev, priv->header_buf_size * buf_count,
209 				  rx->dqo.hdr_bufs.data, rx->dqo.hdr_bufs.addr);
210 		rx->dqo.hdr_bufs.data = NULL;
211 	}
212 }
213 
214 void gve_rx_stop_ring_dqo(struct gve_priv *priv, int idx)
215 {
216 	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);
217 
218 	if (!gve_rx_was_added_to_block(priv, idx))
219 		return;
220 
221 	gve_remove_napi(priv, ntfy_idx);
222 	gve_rx_remove_from_block(priv, idx);
223 }
224 
225 static void gve_rx_free_ring_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
226 				 struct gve_rx_alloc_rings_cfg *cfg)
227 {
228 	struct device *hdev = &priv->pdev->dev;
229 	size_t completion_queue_slots;
230 	size_t buffer_queue_slots;
231 	int idx = rx->q_num;
232 	size_t size;
233 	int i;
234 
235 	completion_queue_slots = rx->dqo.complq.mask + 1;
236 	buffer_queue_slots = rx->dqo.bufq.mask + 1;
237 
238 	if (rx->q_resources) {
239 		dma_free_coherent(hdev, sizeof(*rx->q_resources),
240 				  rx->q_resources, rx->q_resources_bus);
241 		rx->q_resources = NULL;
242 	}
243 
244 	for (i = 0; i < rx->dqo.num_buf_states; i++) {
245 		struct gve_rx_buf_state_dqo *bs = &rx->dqo.buf_states[i];
246 		/* Only free the page for RDA. QPL pages are freed in gve_main.c. */
247 		if (bs->page_info.page)
248 			gve_free_page_dqo(priv, bs, !rx->dqo.qpl);
249 	}
250 	if (rx->dqo.qpl) {
251 		gve_unassign_qpl(cfg->qpl_cfg, rx->dqo.qpl->id);
252 		rx->dqo.qpl = NULL;
253 	}
254 
255 	if (rx->dqo.bufq.desc_ring) {
256 		size = sizeof(rx->dqo.bufq.desc_ring[0]) * buffer_queue_slots;
257 		dma_free_coherent(hdev, size, rx->dqo.bufq.desc_ring,
258 				  rx->dqo.bufq.bus);
259 		rx->dqo.bufq.desc_ring = NULL;
260 	}
261 
262 	if (rx->dqo.complq.desc_ring) {
263 		size = sizeof(rx->dqo.complq.desc_ring[0]) *
264 			completion_queue_slots;
265 		dma_free_coherent(hdev, size, rx->dqo.complq.desc_ring,
266 				  rx->dqo.complq.bus);
267 		rx->dqo.complq.desc_ring = NULL;
268 	}
269 
270 	kvfree(rx->dqo.buf_states);
271 	rx->dqo.buf_states = NULL;
272 
273 	gve_rx_free_hdr_bufs(priv, rx);
274 
275 	netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx);
276 }
277 
278 static int gve_rx_alloc_hdr_bufs(struct gve_priv *priv, struct gve_rx_ring *rx)
279 {
280 	struct device *hdev = &priv->pdev->dev;
281 	int buf_count = rx->dqo.bufq.mask + 1;
282 
283 	rx->dqo.hdr_bufs.data = dma_alloc_coherent(hdev, priv->header_buf_size * buf_count,
284 						   &rx->dqo.hdr_bufs.addr, GFP_KERNEL);
285 	if (!rx->dqo.hdr_bufs.data)
286 		return -ENOMEM;
287 
288 	return 0;
289 }
290 
291 void gve_rx_start_ring_dqo(struct gve_priv *priv, int idx)
292 {
293 	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);
294 
295 	gve_rx_add_to_block(priv, idx);
296 	gve_add_napi(priv, ntfy_idx, gve_napi_poll_dqo);
297 }
298 
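/* Allocate one DQO RX ring: the buf_state array, optional header buffers,
 * completion and buffer descriptor rings, and (for QPL mode) the queue page
 * list assignment. On failure everything allocated so far is torn down via
 * gve_rx_free_ring_dqo().
 */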
299 static int gve_rx_alloc_ring_dqo(struct gve_priv *priv,
300 				 struct gve_rx_alloc_rings_cfg *cfg,
301 				 struct gve_rx_ring *rx,
302 				 int idx)
303 {
304 	struct device *hdev = &priv->pdev->dev;
305 	size_t size;
306 	int i;
307 
308 	const u32 buffer_queue_slots = cfg->raw_addressing ?
309 		priv->options_dqo_rda.rx_buff_ring_entries : cfg->ring_size;
310 	const u32 completion_queue_slots = cfg->ring_size;
311 
312 	netif_dbg(priv, drv, priv->dev, "allocating rx ring DQO\n");
313 
314 	memset(rx, 0, sizeof(*rx));
315 	rx->gve = priv;
316 	rx->q_num = idx;
317 	rx->dqo.bufq.mask = buffer_queue_slots - 1;
318 	rx->dqo.complq.num_free_slots = completion_queue_slots;
319 	rx->dqo.complq.mask = completion_queue_slots - 1;
320 	rx->ctx.skb_head = NULL;
321 	rx->ctx.skb_tail = NULL;
322 
323 	rx->dqo.num_buf_states = cfg->raw_addressing ?
324 		min_t(s16, S16_MAX, buffer_queue_slots * 4) :
325 		priv->rx_pages_per_qpl;
326 	rx->dqo.buf_states = kvcalloc(rx->dqo.num_buf_states,
327 				      sizeof(rx->dqo.buf_states[0]),
328 				      GFP_KERNEL);
329 	if (!rx->dqo.buf_states)
330 		return -ENOMEM;
331 
332 	/* Allocate header buffers for header-split */
333 	if (cfg->enable_header_split)
334 		if (gve_rx_alloc_hdr_bufs(priv, rx))
335 			goto err;
336 
337 	/* Set up linked list of buffer IDs */
338 	for (i = 0; i < rx->dqo.num_buf_states - 1; i++)
339 		rx->dqo.buf_states[i].next = i + 1;
340 
341 	rx->dqo.buf_states[rx->dqo.num_buf_states - 1].next = -1;
342 	rx->dqo.recycled_buf_states.head = -1;
343 	rx->dqo.recycled_buf_states.tail = -1;
344 	rx->dqo.used_buf_states.head = -1;
345 	rx->dqo.used_buf_states.tail = -1;
346 
347 	/* Allocate RX completion queue */
348 	size = sizeof(rx->dqo.complq.desc_ring[0]) *
349 		completion_queue_slots;
350 	rx->dqo.complq.desc_ring =
351 		dma_alloc_coherent(hdev, size, &rx->dqo.complq.bus, GFP_KERNEL);
352 	if (!rx->dqo.complq.desc_ring)
353 		goto err;
354 
355 	/* Allocate RX buffer queue */
356 	size = sizeof(rx->dqo.bufq.desc_ring[0]) * buffer_queue_slots;
357 	rx->dqo.bufq.desc_ring =
358 		dma_alloc_coherent(hdev, size, &rx->dqo.bufq.bus, GFP_KERNEL);
359 	if (!rx->dqo.bufq.desc_ring)
360 		goto err;
361 
362 	if (!cfg->raw_addressing) {
363 		rx->dqo.qpl = gve_assign_rx_qpl(cfg, rx->q_num);
364 		if (!rx->dqo.qpl)
365 			goto err;
366 		rx->dqo.next_qpl_page_idx = 0;
367 	}
368 
369 	rx->q_resources = dma_alloc_coherent(hdev, sizeof(*rx->q_resources),
370 					     &rx->q_resources_bus, GFP_KERNEL);
371 	if (!rx->q_resources)
372 		goto err;
373 
374 	return 0;
375 
376 err:
377 	gve_rx_free_ring_dqo(priv, rx, cfg);
378 	return -ENOMEM;
379 }
380 
381 void gve_rx_write_doorbell_dqo(const struct gve_priv *priv, int queue_idx)
382 {
383 	const struct gve_rx_ring *rx = &priv->rx[queue_idx];
384 	u64 index = be32_to_cpu(rx->q_resources->db_index);
385 
386 	iowrite32(rx->dqo.bufq.tail, &priv->db_bar2[index]);
387 }
388 
389 int gve_rx_alloc_rings_dqo(struct gve_priv *priv,
390 			   struct gve_rx_alloc_rings_cfg *cfg)
391 {
392 	struct gve_rx_ring *rx;
393 	int err;
394 	int i;
395 
396 	if (!cfg->raw_addressing && !cfg->qpls) {
397 		netif_err(priv, drv, priv->dev,
398 			  "Cannot alloc QPL ring before allocing QPLs\n");
399 		return -EINVAL;
400 	}
401 
402 	rx = kvcalloc(cfg->qcfg->max_queues, sizeof(struct gve_rx_ring),
403 		      GFP_KERNEL);
404 	if (!rx)
405 		return -ENOMEM;
406 
407 	for (i = 0; i < cfg->qcfg->num_queues; i++) {
408 		err = gve_rx_alloc_ring_dqo(priv, cfg, &rx[i], i);
409 		if (err) {
410 			netif_err(priv, drv, priv->dev,
411 				  "Failed to alloc rx ring=%d: err=%d\n",
412 				  i, err);
413 			goto err;
414 		}
415 	}
416 
417 	cfg->rx = rx;
418 	return 0;
419 
420 err:
421 	for (i--; i >= 0; i--)
422 		gve_rx_free_ring_dqo(priv, &rx[i], cfg);
423 	kvfree(rx);
424 	return err;
425 }
426 
427 void gve_rx_free_rings_dqo(struct gve_priv *priv,
428 			   struct gve_rx_alloc_rings_cfg *cfg)
429 {
430 	struct gve_rx_ring *rx = cfg->rx;
431 	int i;
432 
433 	if (!rx)
434 		return;
435 
436 	for (i = 0; i < cfg->qcfg->num_queues;  i++)
437 		gve_rx_free_ring_dqo(priv, &rx[i], cfg);
438 
439 	kvfree(rx);
440 	cfg->rx = NULL;
441 }
442 
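/* Refill the buffer queue with as many descriptors as the free completion
 * slots allow, recycling buf_states where possible and allocating new pages
 * otherwise. The doorbell is rung every GVE_RX_BUF_THRESH_DQO buffers.
 */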
443 void gve_rx_post_buffers_dqo(struct gve_rx_ring *rx)
444 {
445 	struct gve_rx_compl_queue_dqo *complq = &rx->dqo.complq;
446 	struct gve_rx_buf_queue_dqo *bufq = &rx->dqo.bufq;
447 	struct gve_priv *priv = rx->gve;
448 	u32 num_avail_slots;
449 	u32 num_full_slots;
450 	u32 num_posted = 0;
451 
452 	num_full_slots = (bufq->tail - bufq->head) & bufq->mask;
453 	num_avail_slots = bufq->mask - num_full_slots;
454 
455 	num_avail_slots = min_t(u32, num_avail_slots, complq->num_free_slots);
456 	while (num_posted < num_avail_slots) {
457 		struct gve_rx_desc_dqo *desc = &bufq->desc_ring[bufq->tail];
458 		struct gve_rx_buf_state_dqo *buf_state;
459 
460 		buf_state = gve_get_recycled_buf_state(rx);
461 		if (unlikely(!buf_state)) {
462 			buf_state = gve_alloc_buf_state(rx);
463 			if (unlikely(!buf_state))
464 				break;
465 
466 			if (unlikely(gve_alloc_page_dqo(rx, buf_state))) {
467 				u64_stats_update_begin(&rx->statss);
468 				rx->rx_buf_alloc_fail++;
469 				u64_stats_update_end(&rx->statss);
470 				gve_free_buf_state(rx, buf_state);
471 				break;
472 			}
473 		}
474 
475 		desc->buf_id = cpu_to_le16(buf_state - rx->dqo.buf_states);
476 		desc->buf_addr = cpu_to_le64(buf_state->addr +
477 					     buf_state->page_info.page_offset);
478 		if (rx->dqo.hdr_bufs.data)
479 			desc->header_buf_addr =
480 				cpu_to_le64(rx->dqo.hdr_bufs.addr +
481 					    priv->header_buf_size * bufq->tail);
482 
483 		bufq->tail = (bufq->tail + 1) & bufq->mask;
484 		complq->num_free_slots--;
485 		num_posted++;
486 
487 		if ((bufq->tail & (GVE_RX_BUF_THRESH_DQO - 1)) == 0)
488 			gve_rx_write_doorbell_dqo(priv, rx->q_num);
489 	}
490 
491 	rx->fill_cnt += num_posted;
492 }
493 
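/* Try to post the buffer again at the next data_buffer_size chunk of its
 * page. If the page only fits one buffer, or a full lap has been made since
 * the page was last seen with a single reference, the chunk may still be in
 * use, so the buffer is parked on the used list instead.
 */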
494 static void gve_try_recycle_buf(struct gve_priv *priv, struct gve_rx_ring *rx,
495 				struct gve_rx_buf_state_dqo *buf_state)
496 {
497 	const u16 data_buffer_size = priv->data_buffer_size_dqo;
498 	int pagecount;
499 
500 	/* Can't reuse if we only fit one buffer per page */
501 	if (data_buffer_size * 2 > PAGE_SIZE)
502 		goto mark_used;
503 
504 	pagecount = gve_buf_ref_cnt(buf_state);
505 
506 	/* Record the offset when we have a single remaining reference.
507 	 *
508 	 * When this happens, we know all of the other offsets of the page are
509 	 * usable.
510 	 */
511 	if (pagecount == 1) {
512 		buf_state->last_single_ref_offset =
513 			buf_state->page_info.page_offset;
514 	}
515 
516 	/* Use the next buffer-sized chunk in the page. */
517 	buf_state->page_info.page_offset += data_buffer_size;
518 	buf_state->page_info.page_offset &= (PAGE_SIZE - 1);
519 
520 	/* If we wrap around to the same offset without ever dropping to 1
521 	 * reference, then we don't know if this offset was ever freed.
522 	 */
523 	if (buf_state->page_info.page_offset ==
524 	    buf_state->last_single_ref_offset) {
525 		goto mark_used;
526 	}
527 
528 	gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state);
529 	return;
530 
531 mark_used:
532 	gve_enqueue_buf_state(rx, &rx->dqo.used_buf_states, buf_state);
533 	rx->dqo.used_buf_states_cnt++;
534 }
535 
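/* Set skb->ip_summed to CHECKSUM_UNNECESSARY when the device has validated
 * the L3/L4 checksums for a recognized packet type; otherwise leave it as
 * CHECKSUM_NONE.
 */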
536 static void gve_rx_skb_csum(struct sk_buff *skb,
537 			    const struct gve_rx_compl_desc_dqo *desc,
538 			    struct gve_ptype ptype)
539 {
540 	skb->ip_summed = CHECKSUM_NONE;
541 
542 	/* HW did not identify and process L3 and L4 headers. */
543 	if (unlikely(!desc->l3_l4_processed))
544 		return;
545 
546 	if (ptype.l3_type == GVE_L3_TYPE_IPV4) {
547 		if (unlikely(desc->csum_ip_err || desc->csum_external_ip_err))
548 			return;
549 	} else if (ptype.l3_type == GVE_L3_TYPE_IPV6) {
550 		/* Checksum should be skipped if this flag is set. */
551 		if (unlikely(desc->ipv6_ex_add))
552 			return;
553 	}
554 
555 	if (unlikely(desc->csum_l4_err))
556 		return;
557 
558 	switch (ptype.l4_type) {
559 	case GVE_L4_TYPE_TCP:
560 	case GVE_L4_TYPE_UDP:
561 	case GVE_L4_TYPE_ICMP:
562 	case GVE_L4_TYPE_SCTP:
563 		skb->ip_summed = CHECKSUM_UNNECESSARY;
564 		break;
565 	default:
566 		break;
567 	}
568 }
569 
570 static void gve_rx_skb_hash(struct sk_buff *skb,
571 			    const struct gve_rx_compl_desc_dqo *compl_desc,
572 			    struct gve_ptype ptype)
573 {
574 	enum pkt_hash_types hash_type = PKT_HASH_TYPE_L2;
575 
576 	if (ptype.l4_type != GVE_L4_TYPE_UNKNOWN)
577 		hash_type = PKT_HASH_TYPE_L4;
578 	else if (ptype.l3_type != GVE_L3_TYPE_UNKNOWN)
579 		hash_type = PKT_HASH_TYPE_L3;
580 
581 	skb_set_hash(skb, le32_to_cpu(compl_desc->hash), hash_type);
582 }
583 
584 static void gve_rx_free_skb(struct gve_rx_ring *rx)
585 {
586 	if (!rx->ctx.skb_head)
587 		return;
588 
589 	dev_kfree_skb_any(rx->ctx.skb_head);
590 	rx->ctx.skb_head = NULL;
591 	rx->ctx.skb_tail = NULL;
592 }
593 
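/* In QPL mode, once nearly all buf_states are sitting on the used list,
 * start copying incoming fragments into freshly allocated pages instead of
 * waiting for QPL pages to be released.
 */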
594 static bool gve_rx_should_trigger_copy_ondemand(struct gve_rx_ring *rx)
595 {
596 	if (!rx->dqo.qpl)
597 		return false;
598 	if (rx->dqo.used_buf_states_cnt <
599 		     (rx->dqo.num_buf_states -
600 		     GVE_DQO_QPL_ONDEMAND_ALLOC_THRESHOLD))
601 		return false;
602 	return true;
603 }
604 
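/* Copy the received fragment into a newly allocated page and attach that
 * page to the current skb, so the buffer it came from can be returned to
 * the recycled list immediately.
 */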
605 static int gve_rx_copy_ondemand(struct gve_rx_ring *rx,
606 				struct gve_rx_buf_state_dqo *buf_state,
607 				u16 buf_len)
608 {
609 	struct page *page = alloc_page(GFP_ATOMIC);
610 	int num_frags;
611 
612 	if (!page)
613 		return -ENOMEM;
614 
615 	memcpy(page_address(page),
616 	       buf_state->page_info.page_address +
617 	       buf_state->page_info.page_offset,
618 	       buf_len);
619 	num_frags = skb_shinfo(rx->ctx.skb_tail)->nr_frags;
620 	skb_add_rx_frag(rx->ctx.skb_tail, num_frags, page,
621 			0, buf_len, PAGE_SIZE);
622 
623 	u64_stats_update_begin(&rx->statss);
624 	rx->rx_frag_alloc_cnt++;
625 	u64_stats_update_end(&rx->statss);
626 	/* Return unused buffer. */
627 	gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state);
628 	return 0;
629 }
630 
631 /* Chains multiple skbs for a single rx packet.
632  * Returns 0 if buffer is appended, -1 otherwise.
633  */
634 static int gve_rx_append_frags(struct napi_struct *napi,
635 			       struct gve_rx_buf_state_dqo *buf_state,
636 			       u16 buf_len, struct gve_rx_ring *rx,
637 			       struct gve_priv *priv)
638 {
639 	int num_frags = skb_shinfo(rx->ctx.skb_tail)->nr_frags;
640 
641 	if (unlikely(num_frags == MAX_SKB_FRAGS)) {
642 		struct sk_buff *skb;
643 
644 		skb = napi_alloc_skb(napi, 0);
645 		if (!skb)
646 			return -1;
647 
648 		if (rx->ctx.skb_tail == rx->ctx.skb_head)
649 			skb_shinfo(rx->ctx.skb_head)->frag_list = skb;
650 		else
651 			rx->ctx.skb_tail->next = skb;
652 		rx->ctx.skb_tail = skb;
653 		num_frags = 0;
654 	}
655 	if (rx->ctx.skb_tail != rx->ctx.skb_head) {
656 		rx->ctx.skb_head->len += buf_len;
657 		rx->ctx.skb_head->data_len += buf_len;
658 		rx->ctx.skb_head->truesize += priv->data_buffer_size_dqo;
659 	}
660 
661 	/* Trigger on-demand page allocation if we are running low on buffers */
662 	if (gve_rx_should_trigger_copy_ondemand(rx))
663 		return gve_rx_copy_ondemand(rx, buf_state, buf_len);
664 
665 	skb_add_rx_frag(rx->ctx.skb_tail, num_frags,
666 			buf_state->page_info.page,
667 			buf_state->page_info.page_offset,
668 			buf_len, priv->data_buffer_size_dqo);
669 	gve_dec_pagecnt_bias(&buf_state->page_info);
670 
671 	/* Advances the buffer's page offset if the page is partially used.
672 	 * Marks the buffer as used if the page is full.
673 	 */
674 	gve_try_recycle_buf(priv, rx, buf_state);
675 	return 0;
676 }
677 
678 /* Returns 0 if descriptor is completed successfully.
679  * Returns -EINVAL if descriptor is invalid.
680  * Returns -ENOMEM if data cannot be copied to skb.
681  */
682 static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx,
683 		      const struct gve_rx_compl_desc_dqo *compl_desc,
684 		      u32 desc_idx, int queue_idx)
685 {
686 	const u16 buffer_id = le16_to_cpu(compl_desc->buf_id);
687 	const bool hbo = compl_desc->header_buffer_overflow;
688 	const bool eop = compl_desc->end_of_packet != 0;
689 	const bool hsplit = compl_desc->split_header;
690 	struct gve_rx_buf_state_dqo *buf_state;
691 	struct gve_priv *priv = rx->gve;
692 	u16 buf_len;
693 	u16 hdr_len;
694 
695 	if (unlikely(buffer_id >= rx->dqo.num_buf_states)) {
696 		net_err_ratelimited("%s: Invalid RX buffer_id=%u\n",
697 				    priv->dev->name, buffer_id);
698 		return -EINVAL;
699 	}
700 	buf_state = &rx->dqo.buf_states[buffer_id];
701 	if (unlikely(!gve_buf_state_is_allocated(rx, buf_state))) {
702 		net_err_ratelimited("%s: RX buffer_id is not allocated: %u\n",
703 				    priv->dev->name, buffer_id);
704 		return -EINVAL;
705 	}
706 
707 	if (unlikely(compl_desc->rx_error)) {
708 		gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states,
709 				      buf_state);
710 		return -EINVAL;
711 	}
712 
713 	buf_len = compl_desc->packet_len;
714 	hdr_len = compl_desc->header_len;
715 
716 	/* The page might not have been used for a while and was likely last
717 	 * written by a different thread.
718 	 */
719 	prefetch(buf_state->page_info.page);
720 
721 	/* Copy the header into the skb in the case of header split */
722 	if (hsplit) {
723 		int unsplit = 0;
724 
725 		if (hdr_len && !hbo) {
726 			rx->ctx.skb_head = gve_rx_copy_data(priv->dev, napi,
727 							    rx->dqo.hdr_bufs.data +
728 							    desc_idx * priv->header_buf_size,
729 							    hdr_len);
730 			if (unlikely(!rx->ctx.skb_head))
731 				goto error;
732 			rx->ctx.skb_tail = rx->ctx.skb_head;
733 		} else {
734 			unsplit = 1;
735 		}
736 		u64_stats_update_begin(&rx->statss);
737 		rx->rx_hsplit_pkt++;
738 		rx->rx_hsplit_unsplit_pkt += unsplit;
739 		rx->rx_hsplit_bytes += hdr_len;
740 		u64_stats_update_end(&rx->statss);
741 	}
742 
743 	/* Sync the portion of the DMA buffer that the CPU needs to read. */
744 	dma_sync_single_range_for_cpu(&priv->pdev->dev, buf_state->addr,
745 				      buf_state->page_info.page_offset,
746 				      buf_len, DMA_FROM_DEVICE);
747 
748 	/* Append to current skb if one exists. */
749 	if (rx->ctx.skb_head) {
750 		if (unlikely(gve_rx_append_frags(napi, buf_state, buf_len, rx,
751 						 priv) != 0)) {
752 			goto error;
753 		}
754 		return 0;
755 	}
756 
757 	if (eop && buf_len <= priv->rx_copybreak) {
758 		rx->ctx.skb_head = gve_rx_copy(priv->dev, napi,
759 					       &buf_state->page_info, buf_len);
760 		if (unlikely(!rx->ctx.skb_head))
761 			goto error;
762 		rx->ctx.skb_tail = rx->ctx.skb_head;
763 
764 		u64_stats_update_begin(&rx->statss);
765 		rx->rx_copied_pkt++;
766 		rx->rx_copybreak_pkt++;
767 		u64_stats_update_end(&rx->statss);
768 
769 		gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states,
770 				      buf_state);
771 		return 0;
772 	}
773 
774 	rx->ctx.skb_head = napi_get_frags(napi);
775 	if (unlikely(!rx->ctx.skb_head))
776 		goto error;
777 	rx->ctx.skb_tail = rx->ctx.skb_head;
778 
779 	if (gve_rx_should_trigger_copy_ondemand(rx)) {
780 		if (gve_rx_copy_ondemand(rx, buf_state, buf_len) < 0)
781 			goto error;
782 		return 0;
783 	}
784 
785 	skb_add_rx_frag(rx->ctx.skb_head, 0, buf_state->page_info.page,
786 			buf_state->page_info.page_offset, buf_len,
787 			priv->data_buffer_size_dqo);
788 	gve_dec_pagecnt_bias(&buf_state->page_info);
789 
790 	gve_try_recycle_buf(priv, rx, buf_state);
791 	return 0;
792 
793 error:
794 	gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state);
795 	return -ENOMEM;
796 }
797 
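/* Fill in gso_type and gso_size for RSC (coalesced) completions so the
 * stack handles the merged packet correctly; only TCP over IPv4 or IPv6 is
 * supported.
 */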
798 static int gve_rx_complete_rsc(struct sk_buff *skb,
799 			       const struct gve_rx_compl_desc_dqo *desc,
800 			       struct gve_ptype ptype)
801 {
802 	struct skb_shared_info *shinfo = skb_shinfo(skb);
803 
804 	/* Only TCP is supported right now. */
805 	if (ptype.l4_type != GVE_L4_TYPE_TCP)
806 		return -EINVAL;
807 
808 	switch (ptype.l3_type) {
809 	case GVE_L3_TYPE_IPV4:
810 		shinfo->gso_type = SKB_GSO_TCPV4;
811 		break;
812 	case GVE_L3_TYPE_IPV6:
813 		shinfo->gso_type = SKB_GSO_TCPV6;
814 		break;
815 	default:
816 		return -EINVAL;
817 	}
818 
819 	shinfo->gso_size = le16_to_cpu(desc->rsc_seg_len);
820 	return 0;
821 }
822 
823 /* Returns 0 if the skb is completed successfully, a negative errno otherwise. */
824 static int gve_rx_complete_skb(struct gve_rx_ring *rx, struct napi_struct *napi,
825 			       const struct gve_rx_compl_desc_dqo *desc,
826 			       netdev_features_t feat)
827 {
828 	struct gve_ptype ptype =
829 		rx->gve->ptype_lut_dqo->ptypes[desc->packet_type];
830 	int err;
831 
832 	skb_record_rx_queue(rx->ctx.skb_head, rx->q_num);
833 
834 	if (feat & NETIF_F_RXHASH)
835 		gve_rx_skb_hash(rx->ctx.skb_head, desc, ptype);
836 
837 	if (feat & NETIF_F_RXCSUM)
838 		gve_rx_skb_csum(rx->ctx.skb_head, desc, ptype);
839 
840 	/* RSC packets must set gso_size, otherwise the TCP stack will complain
841 	 * that packets are larger than the MTU.
842 	 */
843 	if (desc->rsc) {
844 		err = gve_rx_complete_rsc(rx->ctx.skb_head, desc, ptype);
845 		if (err < 0)
846 			return err;
847 	}
848 
849 	if (skb_headlen(rx->ctx.skb_head) == 0)
850 		napi_gro_frags(napi);
851 	else
852 		napi_gro_receive(napi, rx->ctx.skb_head);
853 
854 	return 0;
855 }
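/* NAPI poll: process up to budget completed packets from the completion
 * ring, hand finished skbs to GRO, and repost buffers to the buffer queue
 * before returning.
 */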
856 
857 int gve_rx_poll_dqo(struct gve_notify_block *block, int budget)
858 {
859 	struct napi_struct *napi = &block->napi;
860 	netdev_features_t feat = napi->dev->features;
861 
862 	struct gve_rx_ring *rx = block->rx;
863 	struct gve_rx_compl_queue_dqo *complq = &rx->dqo.complq;
864 
865 	u32 work_done = 0;
866 	u64 bytes = 0;
867 	int err;
868 
869 	while (work_done < budget) {
870 		struct gve_rx_compl_desc_dqo *compl_desc =
871 			&complq->desc_ring[complq->head];
872 		u32 pkt_bytes;
873 
874 		/* No more new packets */
875 		if (compl_desc->generation == complq->cur_gen_bit)
876 			break;
877 
878 		/* Prefetch the next two descriptors. */
879 		prefetch(&complq->desc_ring[(complq->head + 1) & complq->mask]);
880 		prefetch(&complq->desc_ring[(complq->head + 2) & complq->mask]);
881 
882 		/* Do not read data until we own the descriptor */
883 		dma_rmb();
884 
885 		err = gve_rx_dqo(napi, rx, compl_desc, complq->head, rx->q_num);
886 		if (err < 0) {
887 			gve_rx_free_skb(rx);
888 			u64_stats_update_begin(&rx->statss);
889 			if (err == -ENOMEM)
890 				rx->rx_skb_alloc_fail++;
891 			else if (err == -EINVAL)
892 				rx->rx_desc_err_dropped_pkt++;
893 			u64_stats_update_end(&rx->statss);
894 		}
895 
896 		complq->head = (complq->head + 1) & complq->mask;
897 		complq->num_free_slots++;
898 
899 		/* When the ring wraps, the generation bit is flipped. */
900 		complq->cur_gen_bit ^= (complq->head == 0);
901 
902 		/* Receiving a completion means we have space to post another
903 		 * buffer on the buffer queue.
904 		 */
905 		{
906 			struct gve_rx_buf_queue_dqo *bufq = &rx->dqo.bufq;
907 
908 			bufq->head = (bufq->head + 1) & bufq->mask;
909 		}
910 
911 		/* Free-running counter of completed descriptors */
912 		rx->cnt++;
913 
914 		if (!rx->ctx.skb_head)
915 			continue;
916 
917 		if (!compl_desc->end_of_packet)
918 			continue;
919 
920 		work_done++;
921 		pkt_bytes = rx->ctx.skb_head->len;
922 		/* The Ethernet header (first ETH_HLEN bytes) is snipped off
923 		 * by eth_type_trans().
924 		 */
925 		if (skb_headlen(rx->ctx.skb_head))
926 			pkt_bytes += ETH_HLEN;
927 
928 		/* gve_rx_complete_skb() will consume skb if successful */
929 		if (gve_rx_complete_skb(rx, napi, compl_desc, feat) != 0) {
930 			gve_rx_free_skb(rx);
931 			u64_stats_update_begin(&rx->statss);
932 			rx->rx_desc_err_dropped_pkt++;
933 			u64_stats_update_end(&rx->statss);
934 			continue;
935 		}
936 
937 		bytes += pkt_bytes;
938 		rx->ctx.skb_head = NULL;
939 		rx->ctx.skb_tail = NULL;
940 	}
941 
942 	gve_rx_post_buffers_dqo(rx);
943 
944 	u64_stats_update_begin(&rx->statss);
945 	rx->rpackets += work_done;
946 	rx->rbytes += bytes;
947 	u64_stats_update_end(&rx->statss);
948 
949 	return work_done;
950 }
951