xref: /linux/drivers/net/ethernet/google/gve/gve_rx_dqo.c (revision add452d09a38c7a7c44aea55c1015392cebf9fa7)
1 // SPDX-License-Identifier: (GPL-2.0 OR MIT)
2 /* Google virtual Ethernet (gve) driver
3  *
4  * Copyright (C) 2015-2021 Google, Inc.
5  */
6 
7 #include "gve.h"
8 #include "gve_dqo.h"
9 #include "gve_adminq.h"
10 #include "gve_utils.h"
11 #include <linux/ip.h>
12 #include <linux/ipv6.h>
13 #include <linux/skbuff.h>
14 #include <linux/slab.h>
15 #include <net/ip6_checksum.h>
16 #include <net/ipv6.h>
17 #include <net/tcp.h>
18 
19 static int gve_buf_ref_cnt(struct gve_rx_buf_state_dqo *bs)
20 {
21 	return page_count(bs->page_info.page) - bs->page_info.pagecnt_bias;
22 }
23 
24 static void gve_free_page_dqo(struct gve_priv *priv,
25 			      struct gve_rx_buf_state_dqo *bs,
26 			      bool free_page)
27 {
28 	page_ref_sub(bs->page_info.page, bs->page_info.pagecnt_bias - 1);
29 	if (free_page)
30 		gve_free_page(&priv->pdev->dev, bs->page_info.page, bs->addr,
31 			      DMA_FROM_DEVICE);
32 	bs->page_info.page = NULL;
33 }
34 
35 static struct gve_rx_buf_state_dqo *gve_alloc_buf_state(struct gve_rx_ring *rx)
36 {
37 	struct gve_rx_buf_state_dqo *buf_state;
38 	s16 buffer_id;
39 
40 	buffer_id = rx->dqo.free_buf_states;
41 	if (unlikely(buffer_id == -1))
42 		return NULL;
43 
44 	buf_state = &rx->dqo.buf_states[buffer_id];
45 
46 	/* Remove buf_state from free list */
47 	rx->dqo.free_buf_states = buf_state->next;
48 
49 	/* Point buf_state to itself to mark it as allocated */
50 	buf_state->next = buffer_id;
51 
52 	return buf_state;
53 }
54 
55 static bool gve_buf_state_is_allocated(struct gve_rx_ring *rx,
56 				       struct gve_rx_buf_state_dqo *buf_state)
57 {
58 	s16 buffer_id = buf_state - rx->dqo.buf_states;
59 
60 	return buf_state->next == buffer_id;
61 }
62 
63 static void gve_free_buf_state(struct gve_rx_ring *rx,
64 			       struct gve_rx_buf_state_dqo *buf_state)
65 {
66 	s16 buffer_id = buf_state - rx->dqo.buf_states;
67 
68 	buf_state->next = rx->dqo.free_buf_states;
69 	rx->dqo.free_buf_states = buffer_id;
70 }
71 
72 static struct gve_rx_buf_state_dqo *
73 gve_dequeue_buf_state(struct gve_rx_ring *rx, struct gve_index_list *list)
74 {
75 	struct gve_rx_buf_state_dqo *buf_state;
76 	s16 buffer_id;
77 
78 	buffer_id = list->head;
79 	if (unlikely(buffer_id == -1))
80 		return NULL;
81 
82 	buf_state = &rx->dqo.buf_states[buffer_id];
83 
84 	/* Remove buf_state from list */
85 	list->head = buf_state->next;
86 	if (buf_state->next == -1)
87 		list->tail = -1;
88 
89 	/* Point buf_state to itself to mark it as allocated */
90 	buf_state->next = buffer_id;
91 
92 	return buf_state;
93 }
94 
95 static void gve_enqueue_buf_state(struct gve_rx_ring *rx,
96 				  struct gve_index_list *list,
97 				  struct gve_rx_buf_state_dqo *buf_state)
98 {
99 	s16 buffer_id = buf_state - rx->dqo.buf_states;
100 
101 	buf_state->next = -1;
102 
103 	if (list->head == -1) {
104 		list->head = buffer_id;
105 		list->tail = buffer_id;
106 	} else {
107 		int tail = list->tail;
108 
109 		rx->dqo.buf_states[tail].next = buffer_id;
110 		list->tail = buffer_id;
111 	}
112 }
113 
114 static struct gve_rx_buf_state_dqo *
115 gve_get_recycled_buf_state(struct gve_rx_ring *rx)
116 {
117 	struct gve_rx_buf_state_dqo *buf_state;
118 	int i;
119 
120 	/* Recycled buf states are immediately usable. */
121 	buf_state = gve_dequeue_buf_state(rx, &rx->dqo.recycled_buf_states);
122 	if (likely(buf_state))
123 		return buf_state;
124 
125 	if (unlikely(rx->dqo.used_buf_states.head == -1))
126 		return NULL;
127 
128 	/* Used buf states are only usable when ref count reaches 0, which means
129 	 * no SKBs refer to them.
130 	 *
131 	 * Search a limited number before giving up.
132 	 */
133 	for (i = 0; i < 5; i++) {
134 		buf_state = gve_dequeue_buf_state(rx, &rx->dqo.used_buf_states);
135 		if (gve_buf_ref_cnt(buf_state) == 0) {
136 			rx->dqo.used_buf_states_cnt--;
137 			return buf_state;
138 		}
139 
140 		gve_enqueue_buf_state(rx, &rx->dqo.used_buf_states, buf_state);
141 	}
142 
143 	/* For QPL, we cannot allocate any new buffers and must
144 	 * wait for the existing ones to be available.
145 	 */
146 	if (rx->dqo.qpl)
147 		return NULL;
148 
149 	/* If there are no free buf states discard an entry from
150 	 * `used_buf_states` so it can be used.
151 	 */
152 	if (unlikely(rx->dqo.free_buf_states == -1)) {
153 		buf_state = gve_dequeue_buf_state(rx, &rx->dqo.used_buf_states);
154 		if (gve_buf_ref_cnt(buf_state) == 0)
155 			return buf_state;
156 
157 		gve_free_page_dqo(rx->gve, buf_state, true);
158 		gve_free_buf_state(rx, buf_state);
159 	}
160 
161 	return NULL;
162 }
163 
164 static int gve_alloc_page_dqo(struct gve_rx_ring *rx,
165 			      struct gve_rx_buf_state_dqo *buf_state)
166 {
167 	struct gve_priv *priv = rx->gve;
168 	u32 idx;
169 
170 	if (!rx->dqo.qpl) {
171 		int err;
172 
173 		err = gve_alloc_page(priv, &priv->pdev->dev,
174 				     &buf_state->page_info.page,
175 				     &buf_state->addr,
176 				     DMA_FROM_DEVICE, GFP_ATOMIC);
177 		if (err)
178 			return err;
179 	} else {
180 		idx = rx->dqo.next_qpl_page_idx;
181 		if (idx >= gve_get_rx_pages_per_qpl_dqo(priv->rx_desc_cnt)) {
182 			net_err_ratelimited("%s: Out of QPL pages\n",
183 					    priv->dev->name);
184 			return -ENOMEM;
185 		}
186 		buf_state->page_info.page = rx->dqo.qpl->pages[idx];
187 		buf_state->addr = rx->dqo.qpl->page_buses[idx];
188 		rx->dqo.next_qpl_page_idx++;
189 	}
190 	buf_state->page_info.page_offset = 0;
191 	buf_state->page_info.page_address =
192 		page_address(buf_state->page_info.page);
193 	buf_state->last_single_ref_offset = 0;
194 
195 	/* The page already has 1 ref. */
196 	page_ref_add(buf_state->page_info.page, INT_MAX - 1);
197 	buf_state->page_info.pagecnt_bias = INT_MAX;
198 
199 	return 0;
200 }
201 
202 static void gve_rx_free_hdr_bufs(struct gve_priv *priv, struct gve_rx_ring *rx)
203 {
204 	struct device *hdev = &priv->pdev->dev;
205 	int buf_count = rx->dqo.bufq.mask + 1;
206 
207 	if (rx->dqo.hdr_bufs.data) {
208 		dma_free_coherent(hdev, priv->header_buf_size * buf_count,
209 				  rx->dqo.hdr_bufs.data, rx->dqo.hdr_bufs.addr);
210 		rx->dqo.hdr_bufs.data = NULL;
211 	}
212 }
213 
214 static void gve_rx_init_ring_state_dqo(struct gve_rx_ring *rx,
215 				       const u32 buffer_queue_slots,
216 				       const u32 completion_queue_slots)
217 {
218 	int i;
219 
220 	/* Set buffer queue state */
221 	rx->dqo.bufq.mask = buffer_queue_slots - 1;
222 	rx->dqo.bufq.head = 0;
223 	rx->dqo.bufq.tail = 0;
224 
225 	/* Set completion queue state */
226 	rx->dqo.complq.num_free_slots = completion_queue_slots;
227 	rx->dqo.complq.mask = completion_queue_slots - 1;
228 	rx->dqo.complq.cur_gen_bit = 0;
229 	rx->dqo.complq.head = 0;
230 
231 	/* Set RX SKB context */
232 	rx->ctx.skb_head = NULL;
233 	rx->ctx.skb_tail = NULL;
234 
235 	/* Set up linked list of buffer IDs */
236 	if (rx->dqo.buf_states) {
237 		for (i = 0; i < rx->dqo.num_buf_states - 1; i++)
238 			rx->dqo.buf_states[i].next = i + 1;
239 		rx->dqo.buf_states[rx->dqo.num_buf_states - 1].next = -1;
240 	}
241 
242 	rx->dqo.free_buf_states = 0;
243 	rx->dqo.recycled_buf_states.head = -1;
244 	rx->dqo.recycled_buf_states.tail = -1;
245 	rx->dqo.used_buf_states.head = -1;
246 	rx->dqo.used_buf_states.tail = -1;
247 }
248 
249 static void gve_rx_reset_ring_dqo(struct gve_priv *priv, int idx)
250 {
251 	struct gve_rx_ring *rx = &priv->rx[idx];
252 	size_t size;
253 	int i;
254 
255 	const u32 buffer_queue_slots = priv->rx_desc_cnt;
256 	const u32 completion_queue_slots = priv->rx_desc_cnt;
257 
258 	/* Reset buffer queue */
259 	if (rx->dqo.bufq.desc_ring) {
260 		size = sizeof(rx->dqo.bufq.desc_ring[0]) *
261 			buffer_queue_slots;
262 		memset(rx->dqo.bufq.desc_ring, 0, size);
263 	}
264 
265 	/* Reset completion queue */
266 	if (rx->dqo.complq.desc_ring) {
267 		size = sizeof(rx->dqo.complq.desc_ring[0]) *
268 			completion_queue_slots;
269 		memset(rx->dqo.complq.desc_ring, 0, size);
270 	}
271 
272 	/* Reset q_resources */
273 	if (rx->q_resources)
274 		memset(rx->q_resources, 0, sizeof(*rx->q_resources));
275 
276 	/* Reset buf states */
277 	if (rx->dqo.buf_states) {
278 		for (i = 0; i < rx->dqo.num_buf_states; i++) {
279 			struct gve_rx_buf_state_dqo *bs = &rx->dqo.buf_states[i];
280 
281 			if (bs->page_info.page)
282 				gve_free_page_dqo(priv, bs, !rx->dqo.qpl);
283 		}
284 	}
285 
286 	gve_rx_init_ring_state_dqo(rx, buffer_queue_slots,
287 				   completion_queue_slots);
288 }
289 
/* Stop RX ring @idx: remove its NAPI instance, detach it from its notify
 * block and reset the ring state. Does nothing if the ring was never added
 * to a block.
 */
void gve_rx_stop_ring_dqo(struct gve_priv *priv, int idx)
{
	const int napi_idx = gve_rx_idx_to_ntfy(priv, idx);

	if (gve_rx_was_added_to_block(priv, idx)) {
		gve_remove_napi(priv, napi_idx);
		gve_rx_remove_from_block(priv, idx);
		gve_rx_reset_ring_dqo(priv, idx);
	}
}
301 
302 void gve_rx_free_ring_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
303 			  struct gve_rx_alloc_rings_cfg *cfg)
304 {
305 	struct device *hdev = &priv->pdev->dev;
306 	size_t completion_queue_slots;
307 	size_t buffer_queue_slots;
308 	int idx = rx->q_num;
309 	size_t size;
310 	u32 qpl_id;
311 	int i;
312 
313 	completion_queue_slots = rx->dqo.complq.mask + 1;
314 	buffer_queue_slots = rx->dqo.bufq.mask + 1;
315 
316 	if (rx->q_resources) {
317 		dma_free_coherent(hdev, sizeof(*rx->q_resources),
318 				  rx->q_resources, rx->q_resources_bus);
319 		rx->q_resources = NULL;
320 	}
321 
322 	for (i = 0; i < rx->dqo.num_buf_states; i++) {
323 		struct gve_rx_buf_state_dqo *bs = &rx->dqo.buf_states[i];
324 		/* Only free page for RDA. QPL pages are freed in gve_main. */
325 		if (bs->page_info.page)
326 			gve_free_page_dqo(priv, bs, !rx->dqo.qpl);
327 	}
328 
329 	if (rx->dqo.qpl) {
330 		qpl_id = gve_get_rx_qpl_id(cfg->qcfg_tx, rx->q_num);
331 		gve_free_queue_page_list(priv, rx->dqo.qpl, qpl_id);
332 		rx->dqo.qpl = NULL;
333 	}
334 
335 	if (rx->dqo.bufq.desc_ring) {
336 		size = sizeof(rx->dqo.bufq.desc_ring[0]) * buffer_queue_slots;
337 		dma_free_coherent(hdev, size, rx->dqo.bufq.desc_ring,
338 				  rx->dqo.bufq.bus);
339 		rx->dqo.bufq.desc_ring = NULL;
340 	}
341 
342 	if (rx->dqo.complq.desc_ring) {
343 		size = sizeof(rx->dqo.complq.desc_ring[0]) *
344 			completion_queue_slots;
345 		dma_free_coherent(hdev, size, rx->dqo.complq.desc_ring,
346 				  rx->dqo.complq.bus);
347 		rx->dqo.complq.desc_ring = NULL;
348 	}
349 
350 	kvfree(rx->dqo.buf_states);
351 	rx->dqo.buf_states = NULL;
352 
353 	gve_rx_free_hdr_bufs(priv, rx);
354 
355 	netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx);
356 }
357 
358 static int gve_rx_alloc_hdr_bufs(struct gve_priv *priv, struct gve_rx_ring *rx,
359 				 const u32 buf_count)
360 {
361 	struct device *hdev = &priv->pdev->dev;
362 
363 	rx->dqo.hdr_bufs.data = dma_alloc_coherent(hdev, priv->header_buf_size * buf_count,
364 						   &rx->dqo.hdr_bufs.addr, GFP_KERNEL);
365 	if (!rx->dqo.hdr_bufs.data)
366 		return -ENOMEM;
367 
368 	return 0;
369 }
370 
371 void gve_rx_start_ring_dqo(struct gve_priv *priv, int idx)
372 {
373 	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);
374 
375 	gve_rx_add_to_block(priv, idx);
376 	gve_add_napi(priv, ntfy_idx, gve_napi_poll_dqo);
377 }
378 
379 int gve_rx_alloc_ring_dqo(struct gve_priv *priv,
380 			  struct gve_rx_alloc_rings_cfg *cfg,
381 			  struct gve_rx_ring *rx,
382 			  int idx)
383 {
384 	struct device *hdev = &priv->pdev->dev;
385 	int qpl_page_cnt;
386 	size_t size;
387 	u32 qpl_id;
388 
389 	const u32 buffer_queue_slots = cfg->ring_size;
390 	const u32 completion_queue_slots = cfg->ring_size;
391 
392 	netif_dbg(priv, drv, priv->dev, "allocating rx ring DQO\n");
393 
394 	memset(rx, 0, sizeof(*rx));
395 	rx->gve = priv;
396 	rx->q_num = idx;
397 
398 	rx->dqo.num_buf_states = cfg->raw_addressing ?
399 		min_t(s16, S16_MAX, buffer_queue_slots * 4) :
400 		gve_get_rx_pages_per_qpl_dqo(cfg->ring_size);
401 	rx->dqo.buf_states = kvcalloc(rx->dqo.num_buf_states,
402 				      sizeof(rx->dqo.buf_states[0]),
403 				      GFP_KERNEL);
404 	if (!rx->dqo.buf_states)
405 		return -ENOMEM;
406 
407 	/* Allocate header buffers for header-split */
408 	if (cfg->enable_header_split)
409 		if (gve_rx_alloc_hdr_bufs(priv, rx, buffer_queue_slots))
410 			goto err;
411 
412 	/* Allocate RX completion queue */
413 	size = sizeof(rx->dqo.complq.desc_ring[0]) *
414 		completion_queue_slots;
415 	rx->dqo.complq.desc_ring =
416 		dma_alloc_coherent(hdev, size, &rx->dqo.complq.bus, GFP_KERNEL);
417 	if (!rx->dqo.complq.desc_ring)
418 		goto err;
419 
420 	/* Allocate RX buffer queue */
421 	size = sizeof(rx->dqo.bufq.desc_ring[0]) * buffer_queue_slots;
422 	rx->dqo.bufq.desc_ring =
423 		dma_alloc_coherent(hdev, size, &rx->dqo.bufq.bus, GFP_KERNEL);
424 	if (!rx->dqo.bufq.desc_ring)
425 		goto err;
426 
427 	if (!cfg->raw_addressing) {
428 		qpl_id = gve_get_rx_qpl_id(cfg->qcfg_tx, rx->q_num);
429 		qpl_page_cnt = gve_get_rx_pages_per_qpl_dqo(cfg->ring_size);
430 
431 		rx->dqo.qpl = gve_alloc_queue_page_list(priv, qpl_id,
432 							qpl_page_cnt);
433 		if (!rx->dqo.qpl)
434 			goto err;
435 		rx->dqo.next_qpl_page_idx = 0;
436 	}
437 
438 	rx->q_resources = dma_alloc_coherent(hdev, sizeof(*rx->q_resources),
439 					     &rx->q_resources_bus, GFP_KERNEL);
440 	if (!rx->q_resources)
441 		goto err;
442 
443 	gve_rx_init_ring_state_dqo(rx, buffer_queue_slots,
444 				   completion_queue_slots);
445 
446 	return 0;
447 
448 err:
449 	gve_rx_free_ring_dqo(priv, rx, cfg);
450 	return -ENOMEM;
451 }
452 
453 void gve_rx_write_doorbell_dqo(const struct gve_priv *priv, int queue_idx)
454 {
455 	const struct gve_rx_ring *rx = &priv->rx[queue_idx];
456 	u64 index = be32_to_cpu(rx->q_resources->db_index);
457 
458 	iowrite32(rx->dqo.bufq.tail, &priv->db_bar2[index]);
459 }
460 
461 int gve_rx_alloc_rings_dqo(struct gve_priv *priv,
462 			   struct gve_rx_alloc_rings_cfg *cfg)
463 {
464 	struct gve_rx_ring *rx;
465 	int err;
466 	int i;
467 
468 	rx = kvcalloc(cfg->qcfg->max_queues, sizeof(struct gve_rx_ring),
469 		      GFP_KERNEL);
470 	if (!rx)
471 		return -ENOMEM;
472 
473 	for (i = 0; i < cfg->qcfg->num_queues; i++) {
474 		err = gve_rx_alloc_ring_dqo(priv, cfg, &rx[i], i);
475 		if (err) {
476 			netif_err(priv, drv, priv->dev,
477 				  "Failed to alloc rx ring=%d: err=%d\n",
478 				  i, err);
479 			goto err;
480 		}
481 	}
482 
483 	cfg->rx = rx;
484 	return 0;
485 
486 err:
487 	for (i--; i >= 0; i--)
488 		gve_rx_free_ring_dqo(priv, &rx[i], cfg);
489 	kvfree(rx);
490 	return err;
491 }
492 
493 void gve_rx_free_rings_dqo(struct gve_priv *priv,
494 			   struct gve_rx_alloc_rings_cfg *cfg)
495 {
496 	struct gve_rx_ring *rx = cfg->rx;
497 	int i;
498 
499 	if (!rx)
500 		return;
501 
502 	for (i = 0; i < cfg->qcfg->num_queues;  i++)
503 		gve_rx_free_ring_dqo(priv, &rx[i], cfg);
504 
505 	kvfree(rx);
506 	cfg->rx = NULL;
507 }
508 
509 void gve_rx_post_buffers_dqo(struct gve_rx_ring *rx)
510 {
511 	struct gve_rx_compl_queue_dqo *complq = &rx->dqo.complq;
512 	struct gve_rx_buf_queue_dqo *bufq = &rx->dqo.bufq;
513 	struct gve_priv *priv = rx->gve;
514 	u32 num_avail_slots;
515 	u32 num_full_slots;
516 	u32 num_posted = 0;
517 
518 	num_full_slots = (bufq->tail - bufq->head) & bufq->mask;
519 	num_avail_slots = bufq->mask - num_full_slots;
520 
521 	num_avail_slots = min_t(u32, num_avail_slots, complq->num_free_slots);
522 	while (num_posted < num_avail_slots) {
523 		struct gve_rx_desc_dqo *desc = &bufq->desc_ring[bufq->tail];
524 		struct gve_rx_buf_state_dqo *buf_state;
525 
526 		buf_state = gve_get_recycled_buf_state(rx);
527 		if (unlikely(!buf_state)) {
528 			buf_state = gve_alloc_buf_state(rx);
529 			if (unlikely(!buf_state))
530 				break;
531 
532 			if (unlikely(gve_alloc_page_dqo(rx, buf_state))) {
533 				u64_stats_update_begin(&rx->statss);
534 				rx->rx_buf_alloc_fail++;
535 				u64_stats_update_end(&rx->statss);
536 				gve_free_buf_state(rx, buf_state);
537 				break;
538 			}
539 		}
540 
541 		desc->buf_id = cpu_to_le16(buf_state - rx->dqo.buf_states);
542 		desc->buf_addr = cpu_to_le64(buf_state->addr +
543 					     buf_state->page_info.page_offset);
544 		if (rx->dqo.hdr_bufs.data)
545 			desc->header_buf_addr =
546 				cpu_to_le64(rx->dqo.hdr_bufs.addr +
547 					    priv->header_buf_size * bufq->tail);
548 
549 		bufq->tail = (bufq->tail + 1) & bufq->mask;
550 		complq->num_free_slots--;
551 		num_posted++;
552 
553 		if ((bufq->tail & (GVE_RX_BUF_THRESH_DQO - 1)) == 0)
554 			gve_rx_write_doorbell_dqo(priv, rx->q_num);
555 	}
556 
557 	rx->fill_cnt += num_posted;
558 }
559 
560 static void gve_try_recycle_buf(struct gve_priv *priv, struct gve_rx_ring *rx,
561 				struct gve_rx_buf_state_dqo *buf_state)
562 {
563 	const u16 data_buffer_size = priv->data_buffer_size_dqo;
564 	int pagecount;
565 
566 	/* Can't reuse if we only fit one buffer per page */
567 	if (data_buffer_size * 2 > PAGE_SIZE)
568 		goto mark_used;
569 
570 	pagecount = gve_buf_ref_cnt(buf_state);
571 
572 	/* Record the offset when we have a single remaining reference.
573 	 *
574 	 * When this happens, we know all of the other offsets of the page are
575 	 * usable.
576 	 */
577 	if (pagecount == 1) {
578 		buf_state->last_single_ref_offset =
579 			buf_state->page_info.page_offset;
580 	}
581 
582 	/* Use the next buffer sized chunk in the page. */
583 	buf_state->page_info.page_offset += data_buffer_size;
584 	buf_state->page_info.page_offset &= (PAGE_SIZE - 1);
585 
586 	/* If we wrap around to the same offset without ever dropping to 1
587 	 * reference, then we don't know if this offset was ever freed.
588 	 */
589 	if (buf_state->page_info.page_offset ==
590 	    buf_state->last_single_ref_offset) {
591 		goto mark_used;
592 	}
593 
594 	gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state);
595 	return;
596 
597 mark_used:
598 	gve_enqueue_buf_state(rx, &rx->dqo.used_buf_states, buf_state);
599 	rx->dqo.used_buf_states_cnt++;
600 }
601 
602 static void gve_rx_skb_csum(struct sk_buff *skb,
603 			    const struct gve_rx_compl_desc_dqo *desc,
604 			    struct gve_ptype ptype)
605 {
606 	skb->ip_summed = CHECKSUM_NONE;
607 
608 	/* HW did not identify and process L3 and L4 headers. */
609 	if (unlikely(!desc->l3_l4_processed))
610 		return;
611 
612 	if (ptype.l3_type == GVE_L3_TYPE_IPV4) {
613 		if (unlikely(desc->csum_ip_err || desc->csum_external_ip_err))
614 			return;
615 	} else if (ptype.l3_type == GVE_L3_TYPE_IPV6) {
616 		/* Checksum should be skipped if this flag is set. */
617 		if (unlikely(desc->ipv6_ex_add))
618 			return;
619 	}
620 
621 	if (unlikely(desc->csum_l4_err))
622 		return;
623 
624 	switch (ptype.l4_type) {
625 	case GVE_L4_TYPE_TCP:
626 	case GVE_L4_TYPE_UDP:
627 	case GVE_L4_TYPE_ICMP:
628 	case GVE_L4_TYPE_SCTP:
629 		skb->ip_summed = CHECKSUM_UNNECESSARY;
630 		break;
631 	default:
632 		break;
633 	}
634 }
635 
636 static void gve_rx_skb_hash(struct sk_buff *skb,
637 			    const struct gve_rx_compl_desc_dqo *compl_desc,
638 			    struct gve_ptype ptype)
639 {
640 	enum pkt_hash_types hash_type = PKT_HASH_TYPE_L2;
641 
642 	if (ptype.l4_type != GVE_L4_TYPE_UNKNOWN)
643 		hash_type = PKT_HASH_TYPE_L4;
644 	else if (ptype.l3_type != GVE_L3_TYPE_UNKNOWN)
645 		hash_type = PKT_HASH_TYPE_L3;
646 
647 	skb_set_hash(skb, le32_to_cpu(compl_desc->hash), hash_type);
648 }
649 
650 static void gve_rx_free_skb(struct napi_struct *napi, struct gve_rx_ring *rx)
651 {
652 	if (!rx->ctx.skb_head)
653 		return;
654 
655 	if (rx->ctx.skb_head == napi->skb)
656 		napi->skb = NULL;
657 	dev_kfree_skb_any(rx->ctx.skb_head);
658 	rx->ctx.skb_head = NULL;
659 	rx->ctx.skb_tail = NULL;
660 }
661 
662 static bool gve_rx_should_trigger_copy_ondemand(struct gve_rx_ring *rx)
663 {
664 	if (!rx->dqo.qpl)
665 		return false;
666 	if (rx->dqo.used_buf_states_cnt <
667 		     (rx->dqo.num_buf_states -
668 		     GVE_DQO_QPL_ONDEMAND_ALLOC_THRESHOLD))
669 		return false;
670 	return true;
671 }
672 
673 static int gve_rx_copy_ondemand(struct gve_rx_ring *rx,
674 				struct gve_rx_buf_state_dqo *buf_state,
675 				u16 buf_len)
676 {
677 	struct page *page = alloc_page(GFP_ATOMIC);
678 	int num_frags;
679 
680 	if (!page)
681 		return -ENOMEM;
682 
683 	memcpy(page_address(page),
684 	       buf_state->page_info.page_address +
685 	       buf_state->page_info.page_offset,
686 	       buf_len);
687 	num_frags = skb_shinfo(rx->ctx.skb_tail)->nr_frags;
688 	skb_add_rx_frag(rx->ctx.skb_tail, num_frags, page,
689 			0, buf_len, PAGE_SIZE);
690 
691 	u64_stats_update_begin(&rx->statss);
692 	rx->rx_frag_alloc_cnt++;
693 	u64_stats_update_end(&rx->statss);
694 	/* Return unused buffer. */
695 	gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state);
696 	return 0;
697 }
698 
699 /* Chains multi skbs for single rx packet.
700  * Returns 0 if buffer is appended, -1 otherwise.
701  */
702 static int gve_rx_append_frags(struct napi_struct *napi,
703 			       struct gve_rx_buf_state_dqo *buf_state,
704 			       u16 buf_len, struct gve_rx_ring *rx,
705 			       struct gve_priv *priv)
706 {
707 	int num_frags = skb_shinfo(rx->ctx.skb_tail)->nr_frags;
708 
709 	if (unlikely(num_frags == MAX_SKB_FRAGS)) {
710 		struct sk_buff *skb;
711 
712 		skb = napi_alloc_skb(napi, 0);
713 		if (!skb)
714 			return -1;
715 
716 		if (rx->ctx.skb_tail == rx->ctx.skb_head)
717 			skb_shinfo(rx->ctx.skb_head)->frag_list = skb;
718 		else
719 			rx->ctx.skb_tail->next = skb;
720 		rx->ctx.skb_tail = skb;
721 		num_frags = 0;
722 	}
723 	if (rx->ctx.skb_tail != rx->ctx.skb_head) {
724 		rx->ctx.skb_head->len += buf_len;
725 		rx->ctx.skb_head->data_len += buf_len;
726 		rx->ctx.skb_head->truesize += priv->data_buffer_size_dqo;
727 	}
728 
729 	/* Trigger ondemand page allocation if we are running low on buffers */
730 	if (gve_rx_should_trigger_copy_ondemand(rx))
731 		return gve_rx_copy_ondemand(rx, buf_state, buf_len);
732 
733 	skb_add_rx_frag(rx->ctx.skb_tail, num_frags,
734 			buf_state->page_info.page,
735 			buf_state->page_info.page_offset,
736 			buf_len, priv->data_buffer_size_dqo);
737 	gve_dec_pagecnt_bias(&buf_state->page_info);
738 
739 	/* Advances buffer page-offset if page is partially used.
740 	 * Marks buffer as used if page is full.
741 	 */
742 	gve_try_recycle_buf(priv, rx, buf_state);
743 	return 0;
744 }
745 
746 /* Returns 0 if descriptor is completed successfully.
747  * Returns -EINVAL if descriptor is invalid.
748  * Returns -ENOMEM if data cannot be copied to skb.
749  */
750 static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx,
751 		      const struct gve_rx_compl_desc_dqo *compl_desc,
752 		      u32 desc_idx, int queue_idx)
753 {
754 	const u16 buffer_id = le16_to_cpu(compl_desc->buf_id);
755 	const bool hbo = compl_desc->header_buffer_overflow;
756 	const bool eop = compl_desc->end_of_packet != 0;
757 	const bool hsplit = compl_desc->split_header;
758 	struct gve_rx_buf_state_dqo *buf_state;
759 	struct gve_priv *priv = rx->gve;
760 	u16 buf_len;
761 	u16 hdr_len;
762 
763 	if (unlikely(buffer_id >= rx->dqo.num_buf_states)) {
764 		net_err_ratelimited("%s: Invalid RX buffer_id=%u\n",
765 				    priv->dev->name, buffer_id);
766 		return -EINVAL;
767 	}
768 	buf_state = &rx->dqo.buf_states[buffer_id];
769 	if (unlikely(!gve_buf_state_is_allocated(rx, buf_state))) {
770 		net_err_ratelimited("%s: RX buffer_id is not allocated: %u\n",
771 				    priv->dev->name, buffer_id);
772 		return -EINVAL;
773 	}
774 
775 	if (unlikely(compl_desc->rx_error)) {
776 		gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states,
777 				      buf_state);
778 		return -EINVAL;
779 	}
780 
781 	buf_len = compl_desc->packet_len;
782 	hdr_len = compl_desc->header_len;
783 
784 	/* Page might have not been used for awhile and was likely last written
785 	 * by a different thread.
786 	 */
787 	prefetch(buf_state->page_info.page);
788 
789 	/* Copy the header into the skb in the case of header split */
790 	if (hsplit) {
791 		int unsplit = 0;
792 
793 		if (hdr_len && !hbo) {
794 			rx->ctx.skb_head = gve_rx_copy_data(priv->dev, napi,
795 							    rx->dqo.hdr_bufs.data +
796 							    desc_idx * priv->header_buf_size,
797 							    hdr_len);
798 			if (unlikely(!rx->ctx.skb_head))
799 				goto error;
800 			rx->ctx.skb_tail = rx->ctx.skb_head;
801 		} else {
802 			unsplit = 1;
803 		}
804 		u64_stats_update_begin(&rx->statss);
805 		rx->rx_hsplit_pkt++;
806 		rx->rx_hsplit_unsplit_pkt += unsplit;
807 		rx->rx_hsplit_bytes += hdr_len;
808 		u64_stats_update_end(&rx->statss);
809 	}
810 
811 	/* Sync the portion of dma buffer for CPU to read. */
812 	dma_sync_single_range_for_cpu(&priv->pdev->dev, buf_state->addr,
813 				      buf_state->page_info.page_offset,
814 				      buf_len, DMA_FROM_DEVICE);
815 
816 	/* Append to current skb if one exists. */
817 	if (rx->ctx.skb_head) {
818 		if (unlikely(gve_rx_append_frags(napi, buf_state, buf_len, rx,
819 						 priv)) != 0) {
820 			goto error;
821 		}
822 		return 0;
823 	}
824 
825 	if (eop && buf_len <= priv->rx_copybreak) {
826 		rx->ctx.skb_head = gve_rx_copy(priv->dev, napi,
827 					       &buf_state->page_info, buf_len);
828 		if (unlikely(!rx->ctx.skb_head))
829 			goto error;
830 		rx->ctx.skb_tail = rx->ctx.skb_head;
831 
832 		u64_stats_update_begin(&rx->statss);
833 		rx->rx_copied_pkt++;
834 		rx->rx_copybreak_pkt++;
835 		u64_stats_update_end(&rx->statss);
836 
837 		gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states,
838 				      buf_state);
839 		return 0;
840 	}
841 
842 	rx->ctx.skb_head = napi_get_frags(napi);
843 	if (unlikely(!rx->ctx.skb_head))
844 		goto error;
845 	rx->ctx.skb_tail = rx->ctx.skb_head;
846 
847 	if (gve_rx_should_trigger_copy_ondemand(rx)) {
848 		if (gve_rx_copy_ondemand(rx, buf_state, buf_len) < 0)
849 			goto error;
850 		return 0;
851 	}
852 
853 	skb_add_rx_frag(rx->ctx.skb_head, 0, buf_state->page_info.page,
854 			buf_state->page_info.page_offset, buf_len,
855 			priv->data_buffer_size_dqo);
856 	gve_dec_pagecnt_bias(&buf_state->page_info);
857 
858 	gve_try_recycle_buf(priv, rx, buf_state);
859 	return 0;
860 
861 error:
862 	gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state);
863 	return -ENOMEM;
864 }
865 
866 static int gve_rx_complete_rsc(struct sk_buff *skb,
867 			       const struct gve_rx_compl_desc_dqo *desc,
868 			       struct gve_ptype ptype)
869 {
870 	struct skb_shared_info *shinfo = skb_shinfo(skb);
871 
872 	/* Only TCP is supported right now. */
873 	if (ptype.l4_type != GVE_L4_TYPE_TCP)
874 		return -EINVAL;
875 
876 	switch (ptype.l3_type) {
877 	case GVE_L3_TYPE_IPV4:
878 		shinfo->gso_type = SKB_GSO_TCPV4;
879 		break;
880 	case GVE_L3_TYPE_IPV6:
881 		shinfo->gso_type = SKB_GSO_TCPV6;
882 		break;
883 	default:
884 		return -EINVAL;
885 	}
886 
887 	shinfo->gso_size = le16_to_cpu(desc->rsc_seg_len);
888 	return 0;
889 }
890 
891 /* Returns 0 if skb is completed successfully, -1 otherwise. */
892 static int gve_rx_complete_skb(struct gve_rx_ring *rx, struct napi_struct *napi,
893 			       const struct gve_rx_compl_desc_dqo *desc,
894 			       netdev_features_t feat)
895 {
896 	struct gve_ptype ptype =
897 		rx->gve->ptype_lut_dqo->ptypes[desc->packet_type];
898 	int err;
899 
900 	skb_record_rx_queue(rx->ctx.skb_head, rx->q_num);
901 
902 	if (feat & NETIF_F_RXHASH)
903 		gve_rx_skb_hash(rx->ctx.skb_head, desc, ptype);
904 
905 	if (feat & NETIF_F_RXCSUM)
906 		gve_rx_skb_csum(rx->ctx.skb_head, desc, ptype);
907 
908 	/* RSC packets must set gso_size otherwise the TCP stack will complain
909 	 * that packets are larger than MTU.
910 	 */
911 	if (desc->rsc) {
912 		err = gve_rx_complete_rsc(rx->ctx.skb_head, desc, ptype);
913 		if (err < 0)
914 			return err;
915 	}
916 
917 	if (skb_headlen(rx->ctx.skb_head) == 0)
918 		napi_gro_frags(napi);
919 	else
920 		napi_gro_receive(napi, rx->ctx.skb_head);
921 
922 	return 0;
923 }
924 
925 int gve_rx_poll_dqo(struct gve_notify_block *block, int budget)
926 {
927 	struct napi_struct *napi = &block->napi;
928 	netdev_features_t feat = napi->dev->features;
929 
930 	struct gve_rx_ring *rx = block->rx;
931 	struct gve_rx_compl_queue_dqo *complq = &rx->dqo.complq;
932 
933 	u32 work_done = 0;
934 	u64 bytes = 0;
935 	int err;
936 
937 	while (work_done < budget) {
938 		struct gve_rx_compl_desc_dqo *compl_desc =
939 			&complq->desc_ring[complq->head];
940 		u32 pkt_bytes;
941 
942 		/* No more new packets */
943 		if (compl_desc->generation == complq->cur_gen_bit)
944 			break;
945 
946 		/* Prefetch the next two descriptors. */
947 		prefetch(&complq->desc_ring[(complq->head + 1) & complq->mask]);
948 		prefetch(&complq->desc_ring[(complq->head + 2) & complq->mask]);
949 
950 		/* Do not read data until we own the descriptor */
951 		dma_rmb();
952 
953 		err = gve_rx_dqo(napi, rx, compl_desc, complq->head, rx->q_num);
954 		if (err < 0) {
955 			gve_rx_free_skb(napi, rx);
956 			u64_stats_update_begin(&rx->statss);
957 			if (err == -ENOMEM)
958 				rx->rx_skb_alloc_fail++;
959 			else if (err == -EINVAL)
960 				rx->rx_desc_err_dropped_pkt++;
961 			u64_stats_update_end(&rx->statss);
962 		}
963 
964 		complq->head = (complq->head + 1) & complq->mask;
965 		complq->num_free_slots++;
966 
967 		/* When the ring wraps, the generation bit is flipped. */
968 		complq->cur_gen_bit ^= (complq->head == 0);
969 
970 		/* Receiving a completion means we have space to post another
971 		 * buffer on the buffer queue.
972 		 */
973 		{
974 			struct gve_rx_buf_queue_dqo *bufq = &rx->dqo.bufq;
975 
976 			bufq->head = (bufq->head + 1) & bufq->mask;
977 		}
978 
979 		/* Free running counter of completed descriptors */
980 		rx->cnt++;
981 
982 		if (!rx->ctx.skb_head)
983 			continue;
984 
985 		if (!compl_desc->end_of_packet)
986 			continue;
987 
988 		work_done++;
989 		pkt_bytes = rx->ctx.skb_head->len;
990 		/* The ethernet header (first ETH_HLEN bytes) is snipped off
991 		 * by eth_type_trans.
992 		 */
993 		if (skb_headlen(rx->ctx.skb_head))
994 			pkt_bytes += ETH_HLEN;
995 
996 		/* gve_rx_complete_skb() will consume skb if successful */
997 		if (gve_rx_complete_skb(rx, napi, compl_desc, feat) != 0) {
998 			gve_rx_free_skb(napi, rx);
999 			u64_stats_update_begin(&rx->statss);
1000 			rx->rx_desc_err_dropped_pkt++;
1001 			u64_stats_update_end(&rx->statss);
1002 			continue;
1003 		}
1004 
1005 		bytes += pkt_bytes;
1006 		rx->ctx.skb_head = NULL;
1007 		rx->ctx.skb_tail = NULL;
1008 	}
1009 
1010 	gve_rx_post_buffers_dqo(rx);
1011 
1012 	u64_stats_update_begin(&rx->statss);
1013 	rx->rpackets += work_done;
1014 	rx->rbytes += bytes;
1015 	u64_stats_update_end(&rx->statss);
1016 
1017 	return work_done;
1018 }
1019