xref: /linux/drivers/net/ethernet/google/gve/gve_rx_dqo.c (revision 001821b0e79716c4e17c71d8e053a23599a7a508)
1 // SPDX-License-Identifier: (GPL-2.0 OR MIT)
2 /* Google virtual Ethernet (gve) driver
3  *
4  * Copyright (C) 2015-2021 Google, Inc.
5  */
6 
7 #include "gve.h"
8 #include "gve_dqo.h"
9 #include "gve_adminq.h"
10 #include "gve_utils.h"
11 #include <linux/ip.h>
12 #include <linux/ipv6.h>
13 #include <linux/skbuff.h>
14 #include <linux/slab.h>
15 #include <net/ip6_checksum.h>
16 #include <net/ipv6.h>
17 #include <net/tcp.h>
18 
19 static int gve_buf_ref_cnt(struct gve_rx_buf_state_dqo *bs)
20 {
21 	return page_count(bs->page_info.page) - bs->page_info.pagecnt_bias;
22 }
23 
24 static void gve_free_page_dqo(struct gve_priv *priv,
25 			      struct gve_rx_buf_state_dqo *bs,
26 			      bool free_page)
27 {
28 	page_ref_sub(bs->page_info.page, bs->page_info.pagecnt_bias - 1);
29 	if (free_page)
30 		gve_free_page(&priv->pdev->dev, bs->page_info.page, bs->addr,
31 			      DMA_FROM_DEVICE);
32 	bs->page_info.page = NULL;
33 }
34 
35 static struct gve_rx_buf_state_dqo *gve_alloc_buf_state(struct gve_rx_ring *rx)
36 {
37 	struct gve_rx_buf_state_dqo *buf_state;
38 	s16 buffer_id;
39 
40 	buffer_id = rx->dqo.free_buf_states;
41 	if (unlikely(buffer_id == -1))
42 		return NULL;
43 
44 	buf_state = &rx->dqo.buf_states[buffer_id];
45 
46 	/* Remove buf_state from free list */
47 	rx->dqo.free_buf_states = buf_state->next;
48 
49 	/* Point buf_state to itself to mark it as allocated */
50 	buf_state->next = buffer_id;
51 
52 	return buf_state;
53 }
54 
55 static bool gve_buf_state_is_allocated(struct gve_rx_ring *rx,
56 				       struct gve_rx_buf_state_dqo *buf_state)
57 {
58 	s16 buffer_id = buf_state - rx->dqo.buf_states;
59 
60 	return buf_state->next == buffer_id;
61 }
62 
63 static void gve_free_buf_state(struct gve_rx_ring *rx,
64 			       struct gve_rx_buf_state_dqo *buf_state)
65 {
66 	s16 buffer_id = buf_state - rx->dqo.buf_states;
67 
68 	buf_state->next = rx->dqo.free_buf_states;
69 	rx->dqo.free_buf_states = buffer_id;
70 }
71 
72 static struct gve_rx_buf_state_dqo *
73 gve_dequeue_buf_state(struct gve_rx_ring *rx, struct gve_index_list *list)
74 {
75 	struct gve_rx_buf_state_dqo *buf_state;
76 	s16 buffer_id;
77 
78 	buffer_id = list->head;
79 	if (unlikely(buffer_id == -1))
80 		return NULL;
81 
82 	buf_state = &rx->dqo.buf_states[buffer_id];
83 
84 	/* Remove buf_state from list */
85 	list->head = buf_state->next;
86 	if (buf_state->next == -1)
87 		list->tail = -1;
88 
89 	/* Point buf_state to itself to mark it as allocated */
90 	buf_state->next = buffer_id;
91 
92 	return buf_state;
93 }
94 
95 static void gve_enqueue_buf_state(struct gve_rx_ring *rx,
96 				  struct gve_index_list *list,
97 				  struct gve_rx_buf_state_dqo *buf_state)
98 {
99 	s16 buffer_id = buf_state - rx->dqo.buf_states;
100 
101 	buf_state->next = -1;
102 
103 	if (list->head == -1) {
104 		list->head = buffer_id;
105 		list->tail = buffer_id;
106 	} else {
107 		int tail = list->tail;
108 
109 		rx->dqo.buf_states[tail].next = buffer_id;
110 		list->tail = buffer_id;
111 	}
112 }
113 
114 static struct gve_rx_buf_state_dqo *
115 gve_get_recycled_buf_state(struct gve_rx_ring *rx)
116 {
117 	struct gve_rx_buf_state_dqo *buf_state;
118 	int i;
119 
120 	/* Recycled buf states are immediately usable. */
121 	buf_state = gve_dequeue_buf_state(rx, &rx->dqo.recycled_buf_states);
122 	if (likely(buf_state))
123 		return buf_state;
124 
125 	if (unlikely(rx->dqo.used_buf_states.head == -1))
126 		return NULL;
127 
128 	/* Used buf states are only usable when ref count reaches 0, which means
129 	 * no SKBs refer to them.
130 	 *
131 	 * Search a limited number before giving up.
132 	 */
133 	for (i = 0; i < 5; i++) {
134 		buf_state = gve_dequeue_buf_state(rx, &rx->dqo.used_buf_states);
135 		if (gve_buf_ref_cnt(buf_state) == 0) {
136 			rx->dqo.used_buf_states_cnt--;
137 			return buf_state;
138 		}
139 
140 		gve_enqueue_buf_state(rx, &rx->dqo.used_buf_states, buf_state);
141 	}
142 
143 	/* For QPL, we cannot allocate any new buffers and must
144 	 * wait for the existing ones to be available.
145 	 */
146 	if (rx->dqo.qpl)
147 		return NULL;
148 
149 	/* If there are no free buf states discard an entry from
150 	 * `used_buf_states` so it can be used.
151 	 */
152 	if (unlikely(rx->dqo.free_buf_states == -1)) {
153 		buf_state = gve_dequeue_buf_state(rx, &rx->dqo.used_buf_states);
154 		if (gve_buf_ref_cnt(buf_state) == 0)
155 			return buf_state;
156 
157 		gve_free_page_dqo(rx->gve, buf_state, true);
158 		gve_free_buf_state(rx, buf_state);
159 	}
160 
161 	return NULL;
162 }
163 
164 static int gve_alloc_page_dqo(struct gve_rx_ring *rx,
165 			      struct gve_rx_buf_state_dqo *buf_state)
166 {
167 	struct gve_priv *priv = rx->gve;
168 	u32 idx;
169 
170 	if (!rx->dqo.qpl) {
171 		int err;
172 
173 		err = gve_alloc_page(priv, &priv->pdev->dev,
174 				     &buf_state->page_info.page,
175 				     &buf_state->addr,
176 				     DMA_FROM_DEVICE, GFP_ATOMIC);
177 		if (err)
178 			return err;
179 	} else {
180 		idx = rx->dqo.next_qpl_page_idx;
181 		if (idx >= gve_get_rx_pages_per_qpl_dqo(priv->rx_desc_cnt)) {
182 			net_err_ratelimited("%s: Out of QPL pages\n",
183 					    priv->dev->name);
184 			return -ENOMEM;
185 		}
186 		buf_state->page_info.page = rx->dqo.qpl->pages[idx];
187 		buf_state->addr = rx->dqo.qpl->page_buses[idx];
188 		rx->dqo.next_qpl_page_idx++;
189 	}
190 	buf_state->page_info.page_offset = 0;
191 	buf_state->page_info.page_address =
192 		page_address(buf_state->page_info.page);
193 	buf_state->last_single_ref_offset = 0;
194 
195 	/* The page already has 1 ref. */
196 	page_ref_add(buf_state->page_info.page, INT_MAX - 1);
197 	buf_state->page_info.pagecnt_bias = INT_MAX;
198 
199 	return 0;
200 }
201 
202 static void gve_rx_free_hdr_bufs(struct gve_priv *priv, struct gve_rx_ring *rx)
203 {
204 	struct device *hdev = &priv->pdev->dev;
205 	int buf_count = rx->dqo.bufq.mask + 1;
206 
207 	if (rx->dqo.hdr_bufs.data) {
208 		dma_free_coherent(hdev, priv->header_buf_size * buf_count,
209 				  rx->dqo.hdr_bufs.data, rx->dqo.hdr_bufs.addr);
210 		rx->dqo.hdr_bufs.data = NULL;
211 	}
212 }
213 
214 static void gve_rx_init_ring_state_dqo(struct gve_rx_ring *rx,
215 				       const u32 buffer_queue_slots,
216 				       const u32 completion_queue_slots)
217 {
218 	int i;
219 
220 	/* Set buffer queue state */
221 	rx->dqo.bufq.mask = buffer_queue_slots - 1;
222 	rx->dqo.bufq.head = 0;
223 	rx->dqo.bufq.tail = 0;
224 
225 	/* Set completion queue state */
226 	rx->dqo.complq.num_free_slots = completion_queue_slots;
227 	rx->dqo.complq.mask = completion_queue_slots - 1;
228 	rx->dqo.complq.cur_gen_bit = 0;
229 	rx->dqo.complq.head = 0;
230 
231 	/* Set RX SKB context */
232 	rx->ctx.skb_head = NULL;
233 	rx->ctx.skb_tail = NULL;
234 
235 	/* Set up linked list of buffer IDs */
236 	if (rx->dqo.buf_states) {
237 		for (i = 0; i < rx->dqo.num_buf_states - 1; i++)
238 			rx->dqo.buf_states[i].next = i + 1;
239 		rx->dqo.buf_states[rx->dqo.num_buf_states - 1].next = -1;
240 	}
241 
242 	rx->dqo.free_buf_states = 0;
243 	rx->dqo.recycled_buf_states.head = -1;
244 	rx->dqo.recycled_buf_states.tail = -1;
245 	rx->dqo.used_buf_states.head = -1;
246 	rx->dqo.used_buf_states.tail = -1;
247 }
248 
249 static void gve_rx_reset_ring_dqo(struct gve_priv *priv, int idx)
250 {
251 	struct gve_rx_ring *rx = &priv->rx[idx];
252 	size_t size;
253 	int i;
254 
255 	const u32 buffer_queue_slots = priv->rx_desc_cnt;
256 	const u32 completion_queue_slots = priv->rx_desc_cnt;
257 
258 	/* Reset buffer queue */
259 	if (rx->dqo.bufq.desc_ring) {
260 		size = sizeof(rx->dqo.bufq.desc_ring[0]) *
261 			buffer_queue_slots;
262 		memset(rx->dqo.bufq.desc_ring, 0, size);
263 	}
264 
265 	/* Reset completion queue */
266 	if (rx->dqo.complq.desc_ring) {
267 		size = sizeof(rx->dqo.complq.desc_ring[0]) *
268 			completion_queue_slots;
269 		memset(rx->dqo.complq.desc_ring, 0, size);
270 	}
271 
272 	/* Reset q_resources */
273 	if (rx->q_resources)
274 		memset(rx->q_resources, 0, sizeof(*rx->q_resources));
275 
276 	/* Reset buf states */
277 	if (rx->dqo.buf_states) {
278 		for (i = 0; i < rx->dqo.num_buf_states; i++) {
279 			struct gve_rx_buf_state_dqo *bs = &rx->dqo.buf_states[i];
280 
281 			if (bs->page_info.page)
282 				gve_free_page_dqo(priv, bs, !rx->dqo.qpl);
283 		}
284 	}
285 
286 	gve_rx_init_ring_state_dqo(rx, buffer_queue_slots,
287 				   completion_queue_slots);
288 }
289 
/* Stops RX ring @idx: removes its NAPI instance, unlinks it from its notify
 * block and resets the ring state. Does nothing if the ring was never added
 * to a block.
 */
void gve_rx_stop_ring_dqo(struct gve_priv *priv, int idx)
{
	int block_idx = gve_rx_idx_to_ntfy(priv, idx);

	if (gve_rx_was_added_to_block(priv, idx)) {
		gve_remove_napi(priv, block_idx);
		gve_rx_remove_from_block(priv, idx);
		gve_rx_reset_ring_dqo(priv, idx);
	}
}
301 
302 void gve_rx_free_ring_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
303 			  struct gve_rx_alloc_rings_cfg *cfg)
304 {
305 	struct device *hdev = &priv->pdev->dev;
306 	size_t completion_queue_slots;
307 	size_t buffer_queue_slots;
308 	int idx = rx->q_num;
309 	size_t size;
310 	u32 qpl_id;
311 	int i;
312 
313 	completion_queue_slots = rx->dqo.complq.mask + 1;
314 	buffer_queue_slots = rx->dqo.bufq.mask + 1;
315 
316 	if (rx->q_resources) {
317 		dma_free_coherent(hdev, sizeof(*rx->q_resources),
318 				  rx->q_resources, rx->q_resources_bus);
319 		rx->q_resources = NULL;
320 	}
321 
322 	for (i = 0; i < rx->dqo.num_buf_states; i++) {
323 		struct gve_rx_buf_state_dqo *bs = &rx->dqo.buf_states[i];
324 		/* Only free page for RDA. QPL pages are freed in gve_main. */
325 		if (bs->page_info.page)
326 			gve_free_page_dqo(priv, bs, !rx->dqo.qpl);
327 	}
328 
329 	if (rx->dqo.qpl) {
330 		qpl_id = gve_get_rx_qpl_id(cfg->qcfg_tx, rx->q_num);
331 		gve_free_queue_page_list(priv, rx->dqo.qpl, qpl_id);
332 		rx->dqo.qpl = NULL;
333 	}
334 
335 	if (rx->dqo.bufq.desc_ring) {
336 		size = sizeof(rx->dqo.bufq.desc_ring[0]) * buffer_queue_slots;
337 		dma_free_coherent(hdev, size, rx->dqo.bufq.desc_ring,
338 				  rx->dqo.bufq.bus);
339 		rx->dqo.bufq.desc_ring = NULL;
340 	}
341 
342 	if (rx->dqo.complq.desc_ring) {
343 		size = sizeof(rx->dqo.complq.desc_ring[0]) *
344 			completion_queue_slots;
345 		dma_free_coherent(hdev, size, rx->dqo.complq.desc_ring,
346 				  rx->dqo.complq.bus);
347 		rx->dqo.complq.desc_ring = NULL;
348 	}
349 
350 	kvfree(rx->dqo.buf_states);
351 	rx->dqo.buf_states = NULL;
352 
353 	gve_rx_free_hdr_bufs(priv, rx);
354 
355 	netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx);
356 }
357 
358 static int gve_rx_alloc_hdr_bufs(struct gve_priv *priv, struct gve_rx_ring *rx,
359 				 const u32 buf_count)
360 {
361 	struct device *hdev = &priv->pdev->dev;
362 
363 	rx->dqo.hdr_bufs.data = dma_alloc_coherent(hdev, priv->header_buf_size * buf_count,
364 						   &rx->dqo.hdr_bufs.addr, GFP_KERNEL);
365 	if (!rx->dqo.hdr_bufs.data)
366 		return -ENOMEM;
367 
368 	return 0;
369 }
370 
371 void gve_rx_start_ring_dqo(struct gve_priv *priv, int idx)
372 {
373 	int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);
374 
375 	gve_rx_add_to_block(priv, idx);
376 	gve_add_napi(priv, ntfy_idx, gve_napi_poll_dqo);
377 }
378 
379 int gve_rx_alloc_ring_dqo(struct gve_priv *priv,
380 			  struct gve_rx_alloc_rings_cfg *cfg,
381 			  struct gve_rx_ring *rx,
382 			  int idx)
383 {
384 	struct device *hdev = &priv->pdev->dev;
385 	int qpl_page_cnt;
386 	size_t size;
387 	u32 qpl_id;
388 
389 	const u32 buffer_queue_slots = cfg->ring_size;
390 	const u32 completion_queue_slots = cfg->ring_size;
391 
392 	netif_dbg(priv, drv, priv->dev, "allocating rx ring DQO\n");
393 
394 	memset(rx, 0, sizeof(*rx));
395 	rx->gve = priv;
396 	rx->q_num = idx;
397 
398 	rx->dqo.num_buf_states = cfg->raw_addressing ?
399 		min_t(s16, S16_MAX, buffer_queue_slots * 4) :
400 		gve_get_rx_pages_per_qpl_dqo(cfg->ring_size);
401 	rx->dqo.buf_states = kvcalloc(rx->dqo.num_buf_states,
402 				      sizeof(rx->dqo.buf_states[0]),
403 				      GFP_KERNEL);
404 	if (!rx->dqo.buf_states)
405 		return -ENOMEM;
406 
407 	/* Allocate header buffers for header-split */
408 	if (cfg->enable_header_split)
409 		if (gve_rx_alloc_hdr_bufs(priv, rx, buffer_queue_slots))
410 			goto err;
411 
412 	/* Allocate RX completion queue */
413 	size = sizeof(rx->dqo.complq.desc_ring[0]) *
414 		completion_queue_slots;
415 	rx->dqo.complq.desc_ring =
416 		dma_alloc_coherent(hdev, size, &rx->dqo.complq.bus, GFP_KERNEL);
417 	if (!rx->dqo.complq.desc_ring)
418 		goto err;
419 
420 	/* Allocate RX buffer queue */
421 	size = sizeof(rx->dqo.bufq.desc_ring[0]) * buffer_queue_slots;
422 	rx->dqo.bufq.desc_ring =
423 		dma_alloc_coherent(hdev, size, &rx->dqo.bufq.bus, GFP_KERNEL);
424 	if (!rx->dqo.bufq.desc_ring)
425 		goto err;
426 
427 	if (!cfg->raw_addressing) {
428 		qpl_id = gve_get_rx_qpl_id(cfg->qcfg_tx, rx->q_num);
429 		qpl_page_cnt = gve_get_rx_pages_per_qpl_dqo(cfg->ring_size);
430 
431 		rx->dqo.qpl = gve_alloc_queue_page_list(priv, qpl_id,
432 							qpl_page_cnt);
433 		if (!rx->dqo.qpl)
434 			goto err;
435 		rx->dqo.next_qpl_page_idx = 0;
436 	}
437 
438 	rx->q_resources = dma_alloc_coherent(hdev, sizeof(*rx->q_resources),
439 					     &rx->q_resources_bus, GFP_KERNEL);
440 	if (!rx->q_resources)
441 		goto err;
442 
443 	gve_rx_init_ring_state_dqo(rx, buffer_queue_slots,
444 				   completion_queue_slots);
445 
446 	return 0;
447 
448 err:
449 	gve_rx_free_ring_dqo(priv, rx, cfg);
450 	return -ENOMEM;
451 }
452 
453 void gve_rx_write_doorbell_dqo(const struct gve_priv *priv, int queue_idx)
454 {
455 	const struct gve_rx_ring *rx = &priv->rx[queue_idx];
456 	u64 index = be32_to_cpu(rx->q_resources->db_index);
457 
458 	iowrite32(rx->dqo.bufq.tail, &priv->db_bar2[index]);
459 }
460 
461 int gve_rx_alloc_rings_dqo(struct gve_priv *priv,
462 			   struct gve_rx_alloc_rings_cfg *cfg)
463 {
464 	struct gve_rx_ring *rx;
465 	int err;
466 	int i;
467 
468 	rx = kvcalloc(cfg->qcfg->max_queues, sizeof(struct gve_rx_ring),
469 		      GFP_KERNEL);
470 	if (!rx)
471 		return -ENOMEM;
472 
473 	for (i = 0; i < cfg->qcfg->num_queues; i++) {
474 		err = gve_rx_alloc_ring_dqo(priv, cfg, &rx[i], i);
475 		if (err) {
476 			netif_err(priv, drv, priv->dev,
477 				  "Failed to alloc rx ring=%d: err=%d\n",
478 				  i, err);
479 			goto err;
480 		}
481 	}
482 
483 	cfg->rx = rx;
484 	return 0;
485 
486 err:
487 	for (i--; i >= 0; i--)
488 		gve_rx_free_ring_dqo(priv, &rx[i], cfg);
489 	kvfree(rx);
490 	return err;
491 }
492 
493 void gve_rx_free_rings_dqo(struct gve_priv *priv,
494 			   struct gve_rx_alloc_rings_cfg *cfg)
495 {
496 	struct gve_rx_ring *rx = cfg->rx;
497 	int i;
498 
499 	if (!rx)
500 		return;
501 
502 	for (i = 0; i < cfg->qcfg->num_queues;  i++)
503 		gve_rx_free_ring_dqo(priv, &rx[i], cfg);
504 
505 	kvfree(rx);
506 	cfg->rx = NULL;
507 }
508 
509 void gve_rx_post_buffers_dqo(struct gve_rx_ring *rx)
510 {
511 	struct gve_rx_compl_queue_dqo *complq = &rx->dqo.complq;
512 	struct gve_rx_buf_queue_dqo *bufq = &rx->dqo.bufq;
513 	struct gve_priv *priv = rx->gve;
514 	u32 num_avail_slots;
515 	u32 num_full_slots;
516 	u32 num_posted = 0;
517 
518 	num_full_slots = (bufq->tail - bufq->head) & bufq->mask;
519 	num_avail_slots = bufq->mask - num_full_slots;
520 
521 	num_avail_slots = min_t(u32, num_avail_slots, complq->num_free_slots);
522 	while (num_posted < num_avail_slots) {
523 		struct gve_rx_desc_dqo *desc = &bufq->desc_ring[bufq->tail];
524 		struct gve_rx_buf_state_dqo *buf_state;
525 
526 		buf_state = gve_get_recycled_buf_state(rx);
527 		if (unlikely(!buf_state)) {
528 			buf_state = gve_alloc_buf_state(rx);
529 			if (unlikely(!buf_state))
530 				break;
531 
532 			if (unlikely(gve_alloc_page_dqo(rx, buf_state))) {
533 				u64_stats_update_begin(&rx->statss);
534 				rx->rx_buf_alloc_fail++;
535 				u64_stats_update_end(&rx->statss);
536 				gve_free_buf_state(rx, buf_state);
537 				break;
538 			}
539 		}
540 
541 		desc->buf_id = cpu_to_le16(buf_state - rx->dqo.buf_states);
542 		desc->buf_addr = cpu_to_le64(buf_state->addr +
543 					     buf_state->page_info.page_offset);
544 		if (rx->dqo.hdr_bufs.data)
545 			desc->header_buf_addr =
546 				cpu_to_le64(rx->dqo.hdr_bufs.addr +
547 					    priv->header_buf_size * bufq->tail);
548 
549 		bufq->tail = (bufq->tail + 1) & bufq->mask;
550 		complq->num_free_slots--;
551 		num_posted++;
552 
553 		if ((bufq->tail & (GVE_RX_BUF_THRESH_DQO - 1)) == 0)
554 			gve_rx_write_doorbell_dqo(priv, rx->q_num);
555 	}
556 
557 	rx->fill_cnt += num_posted;
558 }
559 
560 static void gve_try_recycle_buf(struct gve_priv *priv, struct gve_rx_ring *rx,
561 				struct gve_rx_buf_state_dqo *buf_state)
562 {
563 	const u16 data_buffer_size = priv->data_buffer_size_dqo;
564 	int pagecount;
565 
566 	/* Can't reuse if we only fit one buffer per page */
567 	if (data_buffer_size * 2 > PAGE_SIZE)
568 		goto mark_used;
569 
570 	pagecount = gve_buf_ref_cnt(buf_state);
571 
572 	/* Record the offset when we have a single remaining reference.
573 	 *
574 	 * When this happens, we know all of the other offsets of the page are
575 	 * usable.
576 	 */
577 	if (pagecount == 1) {
578 		buf_state->last_single_ref_offset =
579 			buf_state->page_info.page_offset;
580 	}
581 
582 	/* Use the next buffer sized chunk in the page. */
583 	buf_state->page_info.page_offset += data_buffer_size;
584 	buf_state->page_info.page_offset &= (PAGE_SIZE - 1);
585 
586 	/* If we wrap around to the same offset without ever dropping to 1
587 	 * reference, then we don't know if this offset was ever freed.
588 	 */
589 	if (buf_state->page_info.page_offset ==
590 	    buf_state->last_single_ref_offset) {
591 		goto mark_used;
592 	}
593 
594 	gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state);
595 	return;
596 
597 mark_used:
598 	gve_enqueue_buf_state(rx, &rx->dqo.used_buf_states, buf_state);
599 	rx->dqo.used_buf_states_cnt++;
600 }
601 
602 static void gve_rx_skb_csum(struct sk_buff *skb,
603 			    const struct gve_rx_compl_desc_dqo *desc,
604 			    struct gve_ptype ptype)
605 {
606 	skb->ip_summed = CHECKSUM_NONE;
607 
608 	/* HW did not identify and process L3 and L4 headers. */
609 	if (unlikely(!desc->l3_l4_processed))
610 		return;
611 
612 	if (ptype.l3_type == GVE_L3_TYPE_IPV4) {
613 		if (unlikely(desc->csum_ip_err || desc->csum_external_ip_err))
614 			return;
615 	} else if (ptype.l3_type == GVE_L3_TYPE_IPV6) {
616 		/* Checksum should be skipped if this flag is set. */
617 		if (unlikely(desc->ipv6_ex_add))
618 			return;
619 	}
620 
621 	if (unlikely(desc->csum_l4_err))
622 		return;
623 
624 	switch (ptype.l4_type) {
625 	case GVE_L4_TYPE_TCP:
626 	case GVE_L4_TYPE_UDP:
627 	case GVE_L4_TYPE_ICMP:
628 	case GVE_L4_TYPE_SCTP:
629 		skb->ip_summed = CHECKSUM_UNNECESSARY;
630 		break;
631 	default:
632 		break;
633 	}
634 }
635 
636 static void gve_rx_skb_hash(struct sk_buff *skb,
637 			    const struct gve_rx_compl_desc_dqo *compl_desc,
638 			    struct gve_ptype ptype)
639 {
640 	enum pkt_hash_types hash_type = PKT_HASH_TYPE_L2;
641 
642 	if (ptype.l4_type != GVE_L4_TYPE_UNKNOWN)
643 		hash_type = PKT_HASH_TYPE_L4;
644 	else if (ptype.l3_type != GVE_L3_TYPE_UNKNOWN)
645 		hash_type = PKT_HASH_TYPE_L3;
646 
647 	skb_set_hash(skb, le32_to_cpu(compl_desc->hash), hash_type);
648 }
649 
650 static void gve_rx_free_skb(struct gve_rx_ring *rx)
651 {
652 	if (!rx->ctx.skb_head)
653 		return;
654 
655 	dev_kfree_skb_any(rx->ctx.skb_head);
656 	rx->ctx.skb_head = NULL;
657 	rx->ctx.skb_tail = NULL;
658 }
659 
660 static bool gve_rx_should_trigger_copy_ondemand(struct gve_rx_ring *rx)
661 {
662 	if (!rx->dqo.qpl)
663 		return false;
664 	if (rx->dqo.used_buf_states_cnt <
665 		     (rx->dqo.num_buf_states -
666 		     GVE_DQO_QPL_ONDEMAND_ALLOC_THRESHOLD))
667 		return false;
668 	return true;
669 }
670 
671 static int gve_rx_copy_ondemand(struct gve_rx_ring *rx,
672 				struct gve_rx_buf_state_dqo *buf_state,
673 				u16 buf_len)
674 {
675 	struct page *page = alloc_page(GFP_ATOMIC);
676 	int num_frags;
677 
678 	if (!page)
679 		return -ENOMEM;
680 
681 	memcpy(page_address(page),
682 	       buf_state->page_info.page_address +
683 	       buf_state->page_info.page_offset,
684 	       buf_len);
685 	num_frags = skb_shinfo(rx->ctx.skb_tail)->nr_frags;
686 	skb_add_rx_frag(rx->ctx.skb_tail, num_frags, page,
687 			0, buf_len, PAGE_SIZE);
688 
689 	u64_stats_update_begin(&rx->statss);
690 	rx->rx_frag_alloc_cnt++;
691 	u64_stats_update_end(&rx->statss);
692 	/* Return unused buffer. */
693 	gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state);
694 	return 0;
695 }
696 
697 /* Chains multi skbs for single rx packet.
698  * Returns 0 if buffer is appended, -1 otherwise.
699  */
700 static int gve_rx_append_frags(struct napi_struct *napi,
701 			       struct gve_rx_buf_state_dqo *buf_state,
702 			       u16 buf_len, struct gve_rx_ring *rx,
703 			       struct gve_priv *priv)
704 {
705 	int num_frags = skb_shinfo(rx->ctx.skb_tail)->nr_frags;
706 
707 	if (unlikely(num_frags == MAX_SKB_FRAGS)) {
708 		struct sk_buff *skb;
709 
710 		skb = napi_alloc_skb(napi, 0);
711 		if (!skb)
712 			return -1;
713 
714 		if (rx->ctx.skb_tail == rx->ctx.skb_head)
715 			skb_shinfo(rx->ctx.skb_head)->frag_list = skb;
716 		else
717 			rx->ctx.skb_tail->next = skb;
718 		rx->ctx.skb_tail = skb;
719 		num_frags = 0;
720 	}
721 	if (rx->ctx.skb_tail != rx->ctx.skb_head) {
722 		rx->ctx.skb_head->len += buf_len;
723 		rx->ctx.skb_head->data_len += buf_len;
724 		rx->ctx.skb_head->truesize += priv->data_buffer_size_dqo;
725 	}
726 
727 	/* Trigger ondemand page allocation if we are running low on buffers */
728 	if (gve_rx_should_trigger_copy_ondemand(rx))
729 		return gve_rx_copy_ondemand(rx, buf_state, buf_len);
730 
731 	skb_add_rx_frag(rx->ctx.skb_tail, num_frags,
732 			buf_state->page_info.page,
733 			buf_state->page_info.page_offset,
734 			buf_len, priv->data_buffer_size_dqo);
735 	gve_dec_pagecnt_bias(&buf_state->page_info);
736 
737 	/* Advances buffer page-offset if page is partially used.
738 	 * Marks buffer as used if page is full.
739 	 */
740 	gve_try_recycle_buf(priv, rx, buf_state);
741 	return 0;
742 }
743 
744 /* Returns 0 if descriptor is completed successfully.
745  * Returns -EINVAL if descriptor is invalid.
746  * Returns -ENOMEM if data cannot be copied to skb.
747  */
748 static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx,
749 		      const struct gve_rx_compl_desc_dqo *compl_desc,
750 		      u32 desc_idx, int queue_idx)
751 {
752 	const u16 buffer_id = le16_to_cpu(compl_desc->buf_id);
753 	const bool hbo = compl_desc->header_buffer_overflow;
754 	const bool eop = compl_desc->end_of_packet != 0;
755 	const bool hsplit = compl_desc->split_header;
756 	struct gve_rx_buf_state_dqo *buf_state;
757 	struct gve_priv *priv = rx->gve;
758 	u16 buf_len;
759 	u16 hdr_len;
760 
761 	if (unlikely(buffer_id >= rx->dqo.num_buf_states)) {
762 		net_err_ratelimited("%s: Invalid RX buffer_id=%u\n",
763 				    priv->dev->name, buffer_id);
764 		return -EINVAL;
765 	}
766 	buf_state = &rx->dqo.buf_states[buffer_id];
767 	if (unlikely(!gve_buf_state_is_allocated(rx, buf_state))) {
768 		net_err_ratelimited("%s: RX buffer_id is not allocated: %u\n",
769 				    priv->dev->name, buffer_id);
770 		return -EINVAL;
771 	}
772 
773 	if (unlikely(compl_desc->rx_error)) {
774 		gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states,
775 				      buf_state);
776 		return -EINVAL;
777 	}
778 
779 	buf_len = compl_desc->packet_len;
780 	hdr_len = compl_desc->header_len;
781 
782 	/* Page might have not been used for awhile and was likely last written
783 	 * by a different thread.
784 	 */
785 	prefetch(buf_state->page_info.page);
786 
787 	/* Copy the header into the skb in the case of header split */
788 	if (hsplit) {
789 		int unsplit = 0;
790 
791 		if (hdr_len && !hbo) {
792 			rx->ctx.skb_head = gve_rx_copy_data(priv->dev, napi,
793 							    rx->dqo.hdr_bufs.data +
794 							    desc_idx * priv->header_buf_size,
795 							    hdr_len);
796 			if (unlikely(!rx->ctx.skb_head))
797 				goto error;
798 			rx->ctx.skb_tail = rx->ctx.skb_head;
799 		} else {
800 			unsplit = 1;
801 		}
802 		u64_stats_update_begin(&rx->statss);
803 		rx->rx_hsplit_pkt++;
804 		rx->rx_hsplit_unsplit_pkt += unsplit;
805 		rx->rx_hsplit_bytes += hdr_len;
806 		u64_stats_update_end(&rx->statss);
807 	}
808 
809 	/* Sync the portion of dma buffer for CPU to read. */
810 	dma_sync_single_range_for_cpu(&priv->pdev->dev, buf_state->addr,
811 				      buf_state->page_info.page_offset,
812 				      buf_len, DMA_FROM_DEVICE);
813 
814 	/* Append to current skb if one exists. */
815 	if (rx->ctx.skb_head) {
816 		if (unlikely(gve_rx_append_frags(napi, buf_state, buf_len, rx,
817 						 priv)) != 0) {
818 			goto error;
819 		}
820 		return 0;
821 	}
822 
823 	if (eop && buf_len <= priv->rx_copybreak) {
824 		rx->ctx.skb_head = gve_rx_copy(priv->dev, napi,
825 					       &buf_state->page_info, buf_len);
826 		if (unlikely(!rx->ctx.skb_head))
827 			goto error;
828 		rx->ctx.skb_tail = rx->ctx.skb_head;
829 
830 		u64_stats_update_begin(&rx->statss);
831 		rx->rx_copied_pkt++;
832 		rx->rx_copybreak_pkt++;
833 		u64_stats_update_end(&rx->statss);
834 
835 		gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states,
836 				      buf_state);
837 		return 0;
838 	}
839 
840 	rx->ctx.skb_head = napi_get_frags(napi);
841 	if (unlikely(!rx->ctx.skb_head))
842 		goto error;
843 	rx->ctx.skb_tail = rx->ctx.skb_head;
844 
845 	if (gve_rx_should_trigger_copy_ondemand(rx)) {
846 		if (gve_rx_copy_ondemand(rx, buf_state, buf_len) < 0)
847 			goto error;
848 		return 0;
849 	}
850 
851 	skb_add_rx_frag(rx->ctx.skb_head, 0, buf_state->page_info.page,
852 			buf_state->page_info.page_offset, buf_len,
853 			priv->data_buffer_size_dqo);
854 	gve_dec_pagecnt_bias(&buf_state->page_info);
855 
856 	gve_try_recycle_buf(priv, rx, buf_state);
857 	return 0;
858 
859 error:
860 	gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state);
861 	return -ENOMEM;
862 }
863 
864 static int gve_rx_complete_rsc(struct sk_buff *skb,
865 			       const struct gve_rx_compl_desc_dqo *desc,
866 			       struct gve_ptype ptype)
867 {
868 	struct skb_shared_info *shinfo = skb_shinfo(skb);
869 
870 	/* Only TCP is supported right now. */
871 	if (ptype.l4_type != GVE_L4_TYPE_TCP)
872 		return -EINVAL;
873 
874 	switch (ptype.l3_type) {
875 	case GVE_L3_TYPE_IPV4:
876 		shinfo->gso_type = SKB_GSO_TCPV4;
877 		break;
878 	case GVE_L3_TYPE_IPV6:
879 		shinfo->gso_type = SKB_GSO_TCPV6;
880 		break;
881 	default:
882 		return -EINVAL;
883 	}
884 
885 	shinfo->gso_size = le16_to_cpu(desc->rsc_seg_len);
886 	return 0;
887 }
888 
889 /* Returns 0 if skb is completed successfully, -1 otherwise. */
890 static int gve_rx_complete_skb(struct gve_rx_ring *rx, struct napi_struct *napi,
891 			       const struct gve_rx_compl_desc_dqo *desc,
892 			       netdev_features_t feat)
893 {
894 	struct gve_ptype ptype =
895 		rx->gve->ptype_lut_dqo->ptypes[desc->packet_type];
896 	int err;
897 
898 	skb_record_rx_queue(rx->ctx.skb_head, rx->q_num);
899 
900 	if (feat & NETIF_F_RXHASH)
901 		gve_rx_skb_hash(rx->ctx.skb_head, desc, ptype);
902 
903 	if (feat & NETIF_F_RXCSUM)
904 		gve_rx_skb_csum(rx->ctx.skb_head, desc, ptype);
905 
906 	/* RSC packets must set gso_size otherwise the TCP stack will complain
907 	 * that packets are larger than MTU.
908 	 */
909 	if (desc->rsc) {
910 		err = gve_rx_complete_rsc(rx->ctx.skb_head, desc, ptype);
911 		if (err < 0)
912 			return err;
913 	}
914 
915 	if (skb_headlen(rx->ctx.skb_head) == 0)
916 		napi_gro_frags(napi);
917 	else
918 		napi_gro_receive(napi, rx->ctx.skb_head);
919 
920 	return 0;
921 }
922 
923 int gve_rx_poll_dqo(struct gve_notify_block *block, int budget)
924 {
925 	struct napi_struct *napi = &block->napi;
926 	netdev_features_t feat = napi->dev->features;
927 
928 	struct gve_rx_ring *rx = block->rx;
929 	struct gve_rx_compl_queue_dqo *complq = &rx->dqo.complq;
930 
931 	u32 work_done = 0;
932 	u64 bytes = 0;
933 	int err;
934 
935 	while (work_done < budget) {
936 		struct gve_rx_compl_desc_dqo *compl_desc =
937 			&complq->desc_ring[complq->head];
938 		u32 pkt_bytes;
939 
940 		/* No more new packets */
941 		if (compl_desc->generation == complq->cur_gen_bit)
942 			break;
943 
944 		/* Prefetch the next two descriptors. */
945 		prefetch(&complq->desc_ring[(complq->head + 1) & complq->mask]);
946 		prefetch(&complq->desc_ring[(complq->head + 2) & complq->mask]);
947 
948 		/* Do not read data until we own the descriptor */
949 		dma_rmb();
950 
951 		err = gve_rx_dqo(napi, rx, compl_desc, complq->head, rx->q_num);
952 		if (err < 0) {
953 			gve_rx_free_skb(rx);
954 			u64_stats_update_begin(&rx->statss);
955 			if (err == -ENOMEM)
956 				rx->rx_skb_alloc_fail++;
957 			else if (err == -EINVAL)
958 				rx->rx_desc_err_dropped_pkt++;
959 			u64_stats_update_end(&rx->statss);
960 		}
961 
962 		complq->head = (complq->head + 1) & complq->mask;
963 		complq->num_free_slots++;
964 
965 		/* When the ring wraps, the generation bit is flipped. */
966 		complq->cur_gen_bit ^= (complq->head == 0);
967 
968 		/* Receiving a completion means we have space to post another
969 		 * buffer on the buffer queue.
970 		 */
971 		{
972 			struct gve_rx_buf_queue_dqo *bufq = &rx->dqo.bufq;
973 
974 			bufq->head = (bufq->head + 1) & bufq->mask;
975 		}
976 
977 		/* Free running counter of completed descriptors */
978 		rx->cnt++;
979 
980 		if (!rx->ctx.skb_head)
981 			continue;
982 
983 		if (!compl_desc->end_of_packet)
984 			continue;
985 
986 		work_done++;
987 		pkt_bytes = rx->ctx.skb_head->len;
988 		/* The ethernet header (first ETH_HLEN bytes) is snipped off
989 		 * by eth_type_trans.
990 		 */
991 		if (skb_headlen(rx->ctx.skb_head))
992 			pkt_bytes += ETH_HLEN;
993 
994 		/* gve_rx_complete_skb() will consume skb if successful */
995 		if (gve_rx_complete_skb(rx, napi, compl_desc, feat) != 0) {
996 			gve_rx_free_skb(rx);
997 			u64_stats_update_begin(&rx->statss);
998 			rx->rx_desc_err_dropped_pkt++;
999 			u64_stats_update_end(&rx->statss);
1000 			continue;
1001 		}
1002 
1003 		bytes += pkt_bytes;
1004 		rx->ctx.skb_head = NULL;
1005 		rx->ctx.skb_tail = NULL;
1006 	}
1007 
1008 	gve_rx_post_buffers_dqo(rx);
1009 
1010 	u64_stats_update_begin(&rx->statss);
1011 	rx->rpackets += work_done;
1012 	rx->rbytes += bytes;
1013 	u64_stats_update_end(&rx->statss);
1014 
1015 	return work_done;
1016 }
1017