xref: /linux/drivers/net/ethernet/google/gve/gve_tx_dqo.c (revision 34f2573661e3e644efaf383178af634a2fd67828)
1 // SPDX-License-Identifier: (GPL-2.0 OR MIT)
2 /* Google virtual Ethernet (gve) driver
3  *
4  * Copyright (C) 2015-2021 Google, Inc.
5  */
6 
7 #include "gve.h"
8 #include "gve_adminq.h"
9 #include "gve_utils.h"
10 #include "gve_dqo.h"
11 #include <net/ip.h>
12 #include <linux/bpf.h>
13 #include <linux/tcp.h>
14 #include <linux/slab.h>
15 #include <linux/skbuff.h>
16 
17 /* Returns true if tx_bufs are available. */
18 static bool gve_has_free_tx_qpl_bufs(struct gve_tx_ring *tx, int count)
19 {
20 	int num_avail;
21 
22 	if (!tx->dqo.qpl)
23 		return true;
24 
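	/* Buffers in flight = allocated - freed. free_tx_qpl_buf_cnt here is
	 * the TX path's cached copy of the completion path's counter, so this
	 * first check may underestimate the number of available buffers.
	 */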
25 	num_avail = tx->dqo.num_tx_qpl_bufs -
26 		(tx->dqo_tx.alloc_tx_qpl_buf_cnt -
27 		 tx->dqo_tx.free_tx_qpl_buf_cnt);
28 
29 	if (count <= num_avail)
30 		return true;
31 
32 	/* Update cached value from dqo_compl. */
33 	tx->dqo_tx.free_tx_qpl_buf_cnt =
34 		atomic_read_acquire(&tx->dqo_compl.free_tx_qpl_buf_cnt);
35 
36 	num_avail = tx->dqo.num_tx_qpl_bufs -
37 		(tx->dqo_tx.alloc_tx_qpl_buf_cnt -
38 		 tx->dqo_tx.free_tx_qpl_buf_cnt);
39 
40 	return count <= num_avail;
41 }
42 
43 static s16
44 gve_alloc_tx_qpl_buf(struct gve_tx_ring *tx)
45 {
46 	s16 index;
47 
48 	index = tx->dqo_tx.free_tx_qpl_buf_head;
49 
50 	/* No TX buffers available, try to steal the list from the
51 	 * completion handler.
52 	 */
53 	if (unlikely(index == -1)) {
54 		tx->dqo_tx.free_tx_qpl_buf_head =
55 			atomic_xchg(&tx->dqo_compl.free_tx_qpl_buf_head, -1);
56 		index = tx->dqo_tx.free_tx_qpl_buf_head;
57 
58 		if (unlikely(index == -1))
59 			return index;
60 	}
61 
62 	/* Remove TX buf from free list */
63 	tx->dqo_tx.free_tx_qpl_buf_head = tx->dqo.tx_qpl_buf_next[index];
64 
65 	return index;
66 }
67 
68 static void
69 gve_free_tx_qpl_bufs(struct gve_tx_ring *tx,
70 		     struct gve_tx_pending_packet_dqo *pkt)
71 {
72 	s16 index;
73 	int i;
74 
75 	if (!pkt->num_bufs)
76 		return;
77 
78 	index = pkt->tx_qpl_buf_ids[0];
79 	/* Create a linked list of buffers to be added to the free list */
80 	for (i = 1; i < pkt->num_bufs; i++) {
81 		tx->dqo.tx_qpl_buf_next[index] = pkt->tx_qpl_buf_ids[i];
82 		index = pkt->tx_qpl_buf_ids[i];
83 	}
84 
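	/* Push the chained buffers onto the completion handler's free list
	 * with a lock-free cmpxchg loop: link the tail of our chain to the
	 * current head and retry if another context updated it concurrently.
	 */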
85 	while (true) {
86 		s16 old_head = atomic_read_acquire(&tx->dqo_compl.free_tx_qpl_buf_head);
87 
88 		tx->dqo.tx_qpl_buf_next[index] = old_head;
89 		if (atomic_cmpxchg(&tx->dqo_compl.free_tx_qpl_buf_head,
90 				   old_head,
91 				   pkt->tx_qpl_buf_ids[0]) == old_head) {
92 			break;
93 		}
94 	}
95 
96 	atomic_add(pkt->num_bufs, &tx->dqo_compl.free_tx_qpl_buf_cnt);
97 	pkt->num_bufs = 0;
98 }
99 
100 /* Returns true if a gve_tx_pending_packet_dqo object is available. */
101 static bool gve_has_pending_packet(struct gve_tx_ring *tx)
102 {
103 	/* Check TX path's list. */
104 	if (tx->dqo_tx.free_pending_packets != -1)
105 		return true;
106 
107 	/* Check completion handler's list. */
108 	if (atomic_read_acquire(&tx->dqo_compl.free_pending_packets) != -1)
109 		return true;
110 
111 	return false;
112 }
113 
114 void gve_xdp_tx_flush_dqo(struct gve_priv *priv, u32 xdp_qid)
115 {
116 	u32 tx_qid = gve_xdp_tx_queue_id(priv, xdp_qid);
117 	struct gve_tx_ring *tx = &priv->tx[tx_qid];
118 
119 	gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
120 }
121 
122 static struct gve_tx_pending_packet_dqo *
123 gve_alloc_pending_packet(struct gve_tx_ring *tx)
124 {
125 	struct gve_tx_pending_packet_dqo *pending_packet;
126 	s16 index;
127 
128 	index = tx->dqo_tx.free_pending_packets;
129 
130 	/* No pending_packets available, try to steal the list from the
131 	 * completion handler.
132 	 */
133 	if (unlikely(index == -1)) {
134 		tx->dqo_tx.free_pending_packets =
135 			atomic_xchg(&tx->dqo_compl.free_pending_packets, -1);
136 		index = tx->dqo_tx.free_pending_packets;
137 
138 		if (unlikely(index == -1))
139 			return NULL;
140 	}
141 
142 	pending_packet = &tx->dqo.pending_packets[index];
143 
144 	/* Remove pending_packet from free list */
145 	tx->dqo_tx.free_pending_packets = pending_packet->next;
146 	pending_packet->state = GVE_PACKET_STATE_PENDING_DATA_COMPL;
147 
148 	return pending_packet;
149 }
150 
151 static void
152 gve_free_pending_packet(struct gve_tx_ring *tx,
153 			struct gve_tx_pending_packet_dqo *pending_packet)
154 {
155 	s16 index = pending_packet - tx->dqo.pending_packets;
156 
157 	pending_packet->state = GVE_PACKET_STATE_UNALLOCATED;
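	/* Lock-free push onto the completion handler's free list, mirroring
	 * the loop in gve_free_tx_qpl_bufs() above.
	 */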
158 	while (true) {
159 		s16 old_head = atomic_read_acquire(&tx->dqo_compl.free_pending_packets);
160 
161 		pending_packet->next = old_head;
162 		if (atomic_cmpxchg(&tx->dqo_compl.free_pending_packets,
163 				   old_head, index) == old_head) {
164 			break;
165 		}
166 	}
167 }
168 
169 /* gve_tx_clean_pending_packets - Cleans up all pending tx requests and buffers.
170  */
171 static void gve_tx_clean_pending_packets(struct gve_tx_ring *tx)
172 {
173 	int i;
174 
175 	for (i = 0; i < tx->dqo.num_pending_packets; i++) {
176 		struct gve_tx_pending_packet_dqo *cur_state =
177 			&tx->dqo.pending_packets[i];
178 		int j;
179 
180 		for (j = 0; j < cur_state->num_bufs; j++) {
181 			if (j == 0) {
182 				dma_unmap_single(tx->dev,
183 					dma_unmap_addr(cur_state, dma[j]),
184 					dma_unmap_len(cur_state, len[j]),
185 					DMA_TO_DEVICE);
186 			} else {
187 				dma_unmap_page(tx->dev,
188 					dma_unmap_addr(cur_state, dma[j]),
189 					dma_unmap_len(cur_state, len[j]),
190 					DMA_TO_DEVICE);
191 			}
192 		}
193 		if (cur_state->skb) {
194 			dev_consume_skb_any(cur_state->skb);
195 			cur_state->skb = NULL;
196 		}
197 	}
198 }
199 
200 void gve_tx_stop_ring_dqo(struct gve_priv *priv, int idx)
201 {
202 	int ntfy_idx = gve_tx_idx_to_ntfy(priv, idx);
203 	struct gve_tx_ring *tx = &priv->tx[idx];
204 
205 	if (!gve_tx_was_added_to_block(priv, idx))
206 		return;
207 
208 	gve_remove_napi(priv, ntfy_idx);
209 	gve_clean_tx_done_dqo(priv, tx, /*napi=*/NULL);
210 	if (tx->netdev_txq)
211 		netdev_tx_reset_queue(tx->netdev_txq);
212 	gve_tx_clean_pending_packets(tx);
213 	gve_tx_remove_from_block(priv, idx);
214 }
215 
216 static void gve_tx_free_ring_dqo(struct gve_priv *priv, struct gve_tx_ring *tx,
217 				 struct gve_tx_alloc_rings_cfg *cfg)
218 {
219 	struct device *hdev = &priv->pdev->dev;
220 	int idx = tx->q_num;
221 	size_t bytes;
222 	u32 qpl_id;
223 
224 	if (tx->q_resources) {
225 		dma_free_coherent(hdev, sizeof(*tx->q_resources),
226 				  tx->q_resources, tx->q_resources_bus);
227 		tx->q_resources = NULL;
228 	}
229 
230 	if (tx->dqo.compl_ring) {
231 		bytes = sizeof(tx->dqo.compl_ring[0]) *
232 			(tx->dqo.complq_mask + 1);
233 		dma_free_coherent(hdev, bytes, tx->dqo.compl_ring,
234 				  tx->complq_bus_dqo);
235 		tx->dqo.compl_ring = NULL;
236 	}
237 
238 	if (tx->dqo.tx_ring) {
239 		bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1);
240 		dma_free_coherent(hdev, bytes, tx->dqo.tx_ring, tx->bus);
241 		tx->dqo.tx_ring = NULL;
242 	}
243 
244 	kvfree(tx->dqo.pending_packets);
245 	tx->dqo.pending_packets = NULL;
246 
247 	kvfree(tx->dqo.tx_qpl_buf_next);
248 	tx->dqo.tx_qpl_buf_next = NULL;
249 
250 	if (tx->dqo.qpl) {
251 		qpl_id = gve_tx_qpl_id(priv, tx->q_num);
252 		gve_free_queue_page_list(priv, tx->dqo.qpl, qpl_id);
253 		tx->dqo.qpl = NULL;
254 	}
255 
256 	netif_dbg(priv, drv, priv->dev, "freed tx queue %d\n", idx);
257 }
258 
259 static int gve_tx_qpl_buf_init(struct gve_tx_ring *tx)
260 {
261 	int num_tx_qpl_bufs = GVE_TX_BUFS_PER_PAGE_DQO *
262 		tx->dqo.qpl->num_entries;
263 	int i;
264 
265 	tx->dqo.tx_qpl_buf_next = kvcalloc(num_tx_qpl_bufs,
266 					   sizeof(tx->dqo.tx_qpl_buf_next[0]),
267 					   GFP_KERNEL);
268 	if (!tx->dqo.tx_qpl_buf_next)
269 		return -ENOMEM;
270 
271 	tx->dqo.num_tx_qpl_bufs = num_tx_qpl_bufs;
272 
273 	/* Generate free TX buf list */
274 	for (i = 0; i < num_tx_qpl_bufs - 1; i++)
275 		tx->dqo.tx_qpl_buf_next[i] = i + 1;
276 	tx->dqo.tx_qpl_buf_next[num_tx_qpl_bufs - 1] = -1;
277 
278 	atomic_set_release(&tx->dqo_compl.free_tx_qpl_buf_head, -1);
279 	return 0;
280 }
281 
282 void gve_tx_start_ring_dqo(struct gve_priv *priv, int idx)
283 {
284 	int ntfy_idx = gve_tx_idx_to_ntfy(priv, idx);
285 	struct gve_tx_ring *tx = &priv->tx[idx];
286 
287 	gve_tx_add_to_block(priv, idx);
288 
289 	if (idx < priv->tx_cfg.num_queues)
290 		tx->netdev_txq = netdev_get_tx_queue(priv->dev, idx);
291 	gve_add_napi(priv, ntfy_idx, gve_napi_poll_dqo);
292 }
293 
294 static int gve_tx_alloc_ring_dqo(struct gve_priv *priv,
295 				 struct gve_tx_alloc_rings_cfg *cfg,
296 				 struct gve_tx_ring *tx,
297 				 int idx)
298 {
299 	struct device *hdev = &priv->pdev->dev;
300 	int num_pending_packets;
301 	int qpl_page_cnt;
302 	size_t bytes;
303 	u32 qpl_id;
304 	int i;
305 
306 	memset(tx, 0, sizeof(*tx));
307 	tx->q_num = idx;
308 	tx->dev = hdev;
309 	spin_lock_init(&tx->dqo_tx.xdp_lock);
310 	atomic_set_release(&tx->dqo_compl.hw_tx_head, 0);
311 
312 	/* Queue sizes must be a power of 2 */
313 	tx->mask = cfg->ring_size - 1;
314 	tx->dqo.complq_mask = tx->mask;
315 
316 	/* The max number of pending packets determines the maximum number of
317 	 * descriptors which may be written to the completion queue.
318 	 *
319 	 * We must set the number small enough to make sure we never overrun the
320 	 * completion queue.
321 	 */
322 	num_pending_packets = tx->dqo.complq_mask + 1;
323 
324 	/* Reserve space for descriptor completions, which will be reported at
325 	 * most every GVE_TX_MIN_RE_INTERVAL packets.
326 	 */
327 	num_pending_packets -=
328 		(tx->dqo.complq_mask + 1) / GVE_TX_MIN_RE_INTERVAL;
329 
330 	/* Each packet may have at most 2 buffer completions if it receives both
331 	 * a miss and reinjection completion.
332 	 */
333 	num_pending_packets /= 2;
334 
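	/* Pending packet indices are stored in s16 fields with -1 as the
	 * "empty" sentinel, so the count is capped at S16_MAX.
	 */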
335 	tx->dqo.num_pending_packets = min_t(int, num_pending_packets, S16_MAX);
336 	tx->dqo.pending_packets = kvcalloc(tx->dqo.num_pending_packets,
337 					   sizeof(tx->dqo.pending_packets[0]),
338 					   GFP_KERNEL);
339 	if (!tx->dqo.pending_packets)
340 		goto err;
341 
342 	/* Set up linked list of pending packets */
343 	for (i = 0; i < tx->dqo.num_pending_packets - 1; i++)
344 		tx->dqo.pending_packets[i].next = i + 1;
345 
346 	tx->dqo.pending_packets[tx->dqo.num_pending_packets - 1].next = -1;
347 	atomic_set_release(&tx->dqo_compl.free_pending_packets, -1);
348 	tx->dqo_compl.miss_completions.head = -1;
349 	tx->dqo_compl.miss_completions.tail = -1;
350 	tx->dqo_compl.timed_out_completions.head = -1;
351 	tx->dqo_compl.timed_out_completions.tail = -1;
352 
353 	bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1);
354 	tx->dqo.tx_ring = dma_alloc_coherent(hdev, bytes, &tx->bus, GFP_KERNEL);
355 	if (!tx->dqo.tx_ring)
356 		goto err;
357 
358 	bytes = sizeof(tx->dqo.compl_ring[0]) * (tx->dqo.complq_mask + 1);
359 	tx->dqo.compl_ring = dma_alloc_coherent(hdev, bytes,
360 						&tx->complq_bus_dqo,
361 						GFP_KERNEL);
362 	if (!tx->dqo.compl_ring)
363 		goto err;
364 
365 	tx->q_resources = dma_alloc_coherent(hdev, sizeof(*tx->q_resources),
366 					     &tx->q_resources_bus, GFP_KERNEL);
367 	if (!tx->q_resources)
368 		goto err;
369 
370 	if (!cfg->raw_addressing) {
371 		qpl_id = gve_tx_qpl_id(priv, tx->q_num);
372 		qpl_page_cnt = priv->tx_pages_per_qpl;
373 
374 		tx->dqo.qpl = gve_alloc_queue_page_list(priv, qpl_id,
375 							qpl_page_cnt);
376 		if (!tx->dqo.qpl)
377 			goto err;
378 
379 		if (gve_tx_qpl_buf_init(tx))
380 			goto err;
381 	}
382 
383 	return 0;
384 
385 err:
386 	gve_tx_free_ring_dqo(priv, tx, cfg);
387 	return -ENOMEM;
388 }
389 
390 int gve_tx_alloc_rings_dqo(struct gve_priv *priv,
391 			   struct gve_tx_alloc_rings_cfg *cfg)
392 {
393 	struct gve_tx_ring *tx = cfg->tx;
394 	int total_queues;
395 	int err = 0;
396 	int i, j;
397 
398 	total_queues = cfg->qcfg->num_queues + cfg->num_xdp_rings;
399 	if (total_queues > cfg->qcfg->max_queues) {
400 		netif_err(priv, drv, priv->dev,
401 			  "Cannot alloc more than the max num of Tx rings\n");
402 		return -EINVAL;
403 	}
404 
405 	tx = kvcalloc(cfg->qcfg->max_queues, sizeof(struct gve_tx_ring),
406 		      GFP_KERNEL);
407 	if (!tx)
408 		return -ENOMEM;
409 
410 	for (i = 0; i < total_queues; i++) {
411 		err = gve_tx_alloc_ring_dqo(priv, cfg, &tx[i], i);
412 		if (err) {
413 			netif_err(priv, drv, priv->dev,
414 				  "Failed to alloc tx ring=%d: err=%d\n",
415 				  i, err);
416 			goto err;
417 		}
418 	}
419 
420 	cfg->tx = tx;
421 	return 0;
422 
423 err:
424 	for (j = 0; j < i; j++)
425 		gve_tx_free_ring_dqo(priv, &tx[j], cfg);
426 	kvfree(tx);
427 	return err;
428 }
429 
430 void gve_tx_free_rings_dqo(struct gve_priv *priv,
431 			   struct gve_tx_alloc_rings_cfg *cfg)
432 {
433 	struct gve_tx_ring *tx = cfg->tx;
434 	int i;
435 
436 	if (!tx)
437 		return;
438 
439 	for (i = 0; i < cfg->qcfg->num_queues + cfg->qcfg->num_xdp_queues; i++)
440 		gve_tx_free_ring_dqo(priv, &tx[i], cfg);
441 
442 	kvfree(tx);
443 	cfg->tx = NULL;
444 }
445 
446 /* Returns the number of slots available in the ring */
447 static u32 num_avail_tx_slots(const struct gve_tx_ring *tx)
448 {
449 	u32 num_used = (tx->dqo_tx.tail - tx->dqo_tx.head) & tx->mask;
450 
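	/* The ring is considered full at mask (ring size - 1) used slots;
	 * one slot is always left unused.
	 */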
451 	return tx->mask - num_used;
452 }
453 
454 /* Checks if the requested number of slots are available in the ring */
455 static bool gve_has_tx_slots_available(struct gve_tx_ring *tx, u32 slots_req)
456 {
457 	u32 num_avail = num_avail_tx_slots(tx);
458 
459 	slots_req += GVE_TX_MIN_DESC_PREVENT_CACHE_OVERLAP;
460 
461 	if (num_avail >= slots_req)
462 		return true;
463 
464 	/* Update cached TX head pointer */
465 	tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head);
466 
467 	return num_avail_tx_slots(tx) >= slots_req;
468 }
469 
470 static bool gve_has_avail_slots_tx_dqo(struct gve_tx_ring *tx,
471 				       int desc_count, int buf_count)
472 {
473 	return gve_has_pending_packet(tx) &&
474 		gve_has_tx_slots_available(tx, desc_count) &&
475 		gve_has_free_tx_qpl_bufs(tx, buf_count);
476 }
477 
478 /* Stops the queue if the required descriptors or buffers are not available.
479  * Return: 0 if stop is not required.
480  */
481 static int gve_maybe_stop_tx_dqo(struct gve_tx_ring *tx,
482 				 int desc_count, int buf_count)
483 {
484 	if (likely(gve_has_avail_slots_tx_dqo(tx, desc_count, buf_count)))
485 		return 0;
486 
487 	/* No space, so stop the queue */
488 	tx->stop_queue++;
489 	netif_tx_stop_queue(tx->netdev_txq);
490 
491 	/* Sync with restarting queue in `gve_tx_poll_dqo()` */
492 	mb();
493 
494 	/* After stopping the queue, check if we can transmit again in order
495 	 * to avoid a TOCTOU race.
496 	 */
497 	if (likely(!gve_has_avail_slots_tx_dqo(tx, desc_count, buf_count)))
498 		return -EBUSY;
499 
500 	netif_tx_start_queue(tx->netdev_txq);
501 	tx->wake_queue++;
502 	return 0;
503 }
504 
505 static void gve_extract_tx_metadata_dqo(const struct sk_buff *skb,
506 					struct gve_tx_metadata_dqo *metadata)
507 {
508 	memset(metadata, 0, sizeof(*metadata));
509 	metadata->version = GVE_TX_METADATA_VERSION_DQO;
510 
511 	if (skb->l4_hash) {
512 		u16 path_hash = skb->hash ^ (skb->hash >> 16);
513 
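		/* Fold the 32-bit skb hash into 15 bits and avoid a value of
		 * zero.
		 */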
514 		path_hash &= (1 << 15) - 1;
515 		if (unlikely(path_hash == 0))
516 			path_hash = ~path_hash;
517 
518 		metadata->path_hash = path_hash;
519 	}
520 }
521 
522 static void gve_tx_fill_pkt_desc_dqo(struct gve_tx_ring *tx, u32 *desc_idx,
523 				     bool enable_csum, u32 len, u64 addr,
524 				     s16 compl_tag, bool eop, bool is_gso)
525 {
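	/* A buffer larger than GVE_TX_MAX_BUF_SIZE_DQO is split across
	 * multiple descriptors; only the final chunk carries the caller's
	 * EOP flag.
	 */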
526 	while (len > 0) {
527 		struct gve_tx_pkt_desc_dqo *desc =
528 			&tx->dqo.tx_ring[*desc_idx].pkt;
529 		u32 cur_len = min_t(u32, len, GVE_TX_MAX_BUF_SIZE_DQO);
530 		bool cur_eop = eop && cur_len == len;
531 
532 		*desc = (struct gve_tx_pkt_desc_dqo){
533 			.buf_addr = cpu_to_le64(addr),
534 			.dtype = GVE_TX_PKT_DESC_DTYPE_DQO,
535 			.end_of_packet = cur_eop,
536 			.checksum_offload_enable = enable_csum,
537 			.compl_tag = cpu_to_le16(compl_tag),
538 			.buf_size = cur_len,
539 		};
540 
541 		addr += cur_len;
542 		len -= cur_len;
543 		*desc_idx = (*desc_idx + 1) & tx->mask;
544 	}
545 }
546 
547 /* Validates and prepares `skb` for TSO.
548  *
549  * Returns header length, or < 0 if invalid.
550  */
551 static int gve_prep_tso(struct sk_buff *skb)
552 {
553 	struct tcphdr *tcp;
554 	int header_len;
555 	u32 paylen;
556 	int err;
557 
558 	/* Note: HW requires MSS (gso_size) to be <= 9728 and the total length
559 	 * of the TSO to be <= 262143.
560 	 *
561 	 * However, we don't validate these because:
562 	 * - Hypervisor enforces a limit of 9K MTU
563 	 * - Kernel will not produce a TSO larger than 64k
564 	 */
565 
566 	if (unlikely(skb_shinfo(skb)->gso_size < GVE_TX_MIN_TSO_MSS_DQO))
567 		return -1;
568 
569 	if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
570 		return -EINVAL;
571 
572 	/* Needed because we will modify the header. */
573 	err = skb_cow_head(skb, 0);
574 	if (err < 0)
575 		return err;
576 
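	/* Remove the payload length from the TCP pseudo-header checksum;
	 * the HW computes the per-segment checksum during TSO.
	 */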
577 	tcp = tcp_hdr(skb);
578 	paylen = skb->len - skb_transport_offset(skb);
579 	csum_replace_by_diff(&tcp->check, (__force __wsum)htonl(paylen));
580 	header_len = skb_tcp_all_headers(skb);
581 
582 	if (unlikely(header_len > GVE_TX_MAX_HDR_SIZE_DQO))
583 		return -EINVAL;
584 
585 	return header_len;
586 }
587 
588 static void gve_tx_fill_tso_ctx_desc(struct gve_tx_tso_context_desc_dqo *desc,
589 				     const struct sk_buff *skb,
590 				     const struct gve_tx_metadata_dqo *metadata,
591 				     int header_len)
592 {
593 	*desc = (struct gve_tx_tso_context_desc_dqo){
594 		.header_len = header_len,
595 		.cmd_dtype = {
596 			.dtype = GVE_TX_TSO_CTX_DESC_DTYPE_DQO,
597 			.tso = 1,
598 		},
599 		.flex0 = metadata->bytes[0],
600 		.flex5 = metadata->bytes[5],
601 		.flex6 = metadata->bytes[6],
602 		.flex7 = metadata->bytes[7],
603 		.flex8 = metadata->bytes[8],
604 		.flex9 = metadata->bytes[9],
605 		.flex10 = metadata->bytes[10],
606 		.flex11 = metadata->bytes[11],
607 	};
608 	desc->tso_total_len = skb->len - header_len;
609 	desc->mss = skb_shinfo(skb)->gso_size;
610 }
611 
612 static void
613 gve_tx_fill_general_ctx_desc(struct gve_tx_general_context_desc_dqo *desc,
614 			     const struct gve_tx_metadata_dqo *metadata)
615 {
616 	*desc = (struct gve_tx_general_context_desc_dqo){
617 		.flex0 = metadata->bytes[0],
618 		.flex1 = metadata->bytes[1],
619 		.flex2 = metadata->bytes[2],
620 		.flex3 = metadata->bytes[3],
621 		.flex4 = metadata->bytes[4],
622 		.flex5 = metadata->bytes[5],
623 		.flex6 = metadata->bytes[6],
624 		.flex7 = metadata->bytes[7],
625 		.flex8 = metadata->bytes[8],
626 		.flex9 = metadata->bytes[9],
627 		.flex10 = metadata->bytes[10],
628 		.flex11 = metadata->bytes[11],
629 		.cmd_dtype = {.dtype = GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO},
630 	};
631 }
632 
633 static void gve_tx_update_tail(struct gve_tx_ring *tx, u32 desc_idx)
634 {
635 	u32 last_desc_idx = (desc_idx - 1) & tx->mask;
636 	u32 last_report_event_interval =
637 			(last_desc_idx - tx->dqo_tx.last_re_idx) & tx->mask;
638 
639 	/* Commit the changes to our state */
640 	tx->dqo_tx.tail = desc_idx;
641 
642 	/* Request a descriptor completion on the last descriptor of the
643 	 * packet if we are allowed to by the HW enforced interval.
644 	 */
645 
646 	if (unlikely(last_report_event_interval >= GVE_TX_MIN_RE_INTERVAL)) {
647 		tx->dqo.tx_ring[last_desc_idx].pkt.report_event = true;
648 		tx->dqo_tx.last_re_idx = last_desc_idx;
649 	}
650 }
651 
652 static int gve_tx_add_skb_no_copy_dqo(struct gve_tx_ring *tx,
653 				      struct sk_buff *skb,
654 				      struct gve_tx_pending_packet_dqo *pkt,
655 				      s16 completion_tag,
656 				      u32 *desc_idx,
657 				      bool is_gso)
658 {
659 	bool enable_csum = skb->ip_summed == CHECKSUM_PARTIAL;
660 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
661 	int i;
662 
663 	/* Note: HW requires that the size of a non-TSO packet be within the
664 	 * range of [17, 9728].
665 	 *
666 	 * We don't double check because
667 	 * - We limited `netdev->min_mtu` to ETH_MIN_MTU.
668 	 * - Hypervisor won't allow MTU larger than 9216.
669 	 */
670 
671 	pkt->num_bufs = 0;
672 	/* Map the linear portion of skb */
673 	{
674 		u32 len = skb_headlen(skb);
675 		dma_addr_t addr;
676 
677 		addr = dma_map_single(tx->dev, skb->data, len, DMA_TO_DEVICE);
678 		if (unlikely(dma_mapping_error(tx->dev, addr)))
679 			goto err;
680 
681 		dma_unmap_len_set(pkt, len[pkt->num_bufs], len);
682 		dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr);
683 		++pkt->num_bufs;
684 
685 		gve_tx_fill_pkt_desc_dqo(tx, desc_idx, enable_csum, len, addr,
686 					 completion_tag,
687 					 /*eop=*/shinfo->nr_frags == 0, is_gso);
688 	}
689 
690 	for (i = 0; i < shinfo->nr_frags; i++) {
691 		const skb_frag_t *frag = &shinfo->frags[i];
692 		bool is_eop = i == (shinfo->nr_frags - 1);
693 		u32 len = skb_frag_size(frag);
694 		dma_addr_t addr;
695 
696 		addr = skb_frag_dma_map(tx->dev, frag, 0, len, DMA_TO_DEVICE);
697 		if (unlikely(dma_mapping_error(tx->dev, addr)))
698 			goto err;
699 
700 		dma_unmap_len_set(pkt, len[pkt->num_bufs], len);
701 		netmem_dma_unmap_addr_set(skb_frag_netmem(frag), pkt,
702 					  dma[pkt->num_bufs], addr);
703 		++pkt->num_bufs;
704 
705 		gve_tx_fill_pkt_desc_dqo(tx, desc_idx, enable_csum, len, addr,
706 					 completion_tag, is_eop, is_gso);
707 	}
708 
709 	return 0;
710 err:
711 	for (i = 0; i < pkt->num_bufs; i++) {
712 		if (i == 0) {
713 			dma_unmap_single(tx->dev,
714 					 dma_unmap_addr(pkt, dma[i]),
715 					 dma_unmap_len(pkt, len[i]),
716 					 DMA_TO_DEVICE);
717 		} else {
718 			dma_unmap_page(tx->dev,
719 				       dma_unmap_addr(pkt, dma[i]),
720 				       dma_unmap_len(pkt, len[i]),
721 				       DMA_TO_DEVICE);
722 		}
723 	}
724 	pkt->num_bufs = 0;
725 	return -1;
726 }
727 
728 /* Tx buffer i corresponds to
729  * qpl_page_id = i / GVE_TX_BUFS_PER_PAGE_DQO
730  * qpl_page_offset = (i % GVE_TX_BUFS_PER_PAGE_DQO) * GVE_TX_BUF_SIZE_DQO
731  */
732 static void gve_tx_buf_get_addr(struct gve_tx_ring *tx,
733 				s16 index,
734 				void **va, dma_addr_t *dma_addr)
735 {
736 	int page_id = index >> (PAGE_SHIFT - GVE_TX_BUF_SHIFT_DQO);
737 	int offset = (index & (GVE_TX_BUFS_PER_PAGE_DQO - 1)) << GVE_TX_BUF_SHIFT_DQO;
738 
739 	*va = page_address(tx->dqo.qpl->pages[page_id]) + offset;
740 	*dma_addr = tx->dqo.qpl->page_buses[page_id] + offset;
741 }
742 
743 static int gve_tx_add_skb_copy_dqo(struct gve_tx_ring *tx,
744 				   struct sk_buff *skb,
745 				   struct gve_tx_pending_packet_dqo *pkt,
746 				   s16 completion_tag,
747 				   u32 *desc_idx,
748 				   bool is_gso)
749 {
750 	bool enable_csum = skb->ip_summed == CHECKSUM_PARTIAL;
751 	u32 copy_offset = 0;
752 	dma_addr_t dma_addr;
753 	u32 copy_len;
754 	s16 index;
755 	void *va;
756 
757 	/* Break the packet into buffer size chunks */
758 	pkt->num_bufs = 0;
759 	while (copy_offset < skb->len) {
760 		index = gve_alloc_tx_qpl_buf(tx);
761 		if (unlikely(index == -1))
762 			goto err;
763 
764 		gve_tx_buf_get_addr(tx, index, &va, &dma_addr);
765 		copy_len = min_t(u32, GVE_TX_BUF_SIZE_DQO,
766 				 skb->len - copy_offset);
767 		skb_copy_bits(skb, copy_offset, va, copy_len);
768 
769 		copy_offset += copy_len;
770 		dma_sync_single_for_device(tx->dev, dma_addr,
771 					   copy_len, DMA_TO_DEVICE);
772 		gve_tx_fill_pkt_desc_dqo(tx, desc_idx, enable_csum,
773 					 copy_len,
774 					 dma_addr,
775 					 completion_tag,
776 					 copy_offset == skb->len,
777 					 is_gso);
778 
779 		pkt->tx_qpl_buf_ids[pkt->num_bufs] = index;
780 		++tx->dqo_tx.alloc_tx_qpl_buf_cnt;
781 		++pkt->num_bufs;
782 	}
783 
784 	return 0;
785 err:
786 	/* Should not be here if gve_has_free_tx_qpl_bufs() check is correct */
787 	gve_free_tx_qpl_bufs(tx, pkt);
788 	return -ENOMEM;
789 }
790 
791 /* Returns 0 on success, or < 0 on error.
792  *
793  * Before this function is called, the caller must ensure
794  * gve_has_pending_packet(tx) returns true.
795  */
796 static int gve_tx_add_skb_dqo(struct gve_tx_ring *tx,
797 			      struct sk_buff *skb)
798 {
799 	const bool is_gso = skb_is_gso(skb);
800 	u32 desc_idx = tx->dqo_tx.tail;
801 	struct gve_tx_pending_packet_dqo *pkt;
802 	struct gve_tx_metadata_dqo metadata;
803 	s16 completion_tag;
804 
805 	pkt = gve_alloc_pending_packet(tx);
806 	if (!pkt)
807 		return -ENOMEM;
808 
809 	pkt->skb = skb;
810 	pkt->type = GVE_TX_PENDING_PACKET_DQO_SKB;
811 	completion_tag = pkt - tx->dqo.pending_packets;
812 
813 	gve_extract_tx_metadata_dqo(skb, &metadata);
814 	if (is_gso) {
815 		int header_len = gve_prep_tso(skb);
816 
817 		if (unlikely(header_len < 0))
818 			goto err;
819 
820 		gve_tx_fill_tso_ctx_desc(&tx->dqo.tx_ring[desc_idx].tso_ctx,
821 					 skb, &metadata, header_len);
822 		desc_idx = (desc_idx + 1) & tx->mask;
823 	}
824 
825 	gve_tx_fill_general_ctx_desc(&tx->dqo.tx_ring[desc_idx].general_ctx,
826 				     &metadata);
827 	desc_idx = (desc_idx + 1) & tx->mask;
828 
829 	if (tx->dqo.qpl) {
830 		if (gve_tx_add_skb_copy_dqo(tx, skb, pkt,
831 					    completion_tag,
832 					    &desc_idx, is_gso))
833 			goto err;
834 	} else {
835 		if (gve_tx_add_skb_no_copy_dqo(tx, skb, pkt,
836 					       completion_tag,
837 					       &desc_idx, is_gso))
838 			goto err;
839 	}
840 
841 	tx->dqo_tx.posted_packet_desc_cnt += pkt->num_bufs;
842 
843 	gve_tx_update_tail(tx, desc_idx);
844 	return 0;
845 
846 err:
847 	pkt->skb = NULL;
848 	gve_free_pending_packet(tx, pkt);
849 
850 	return -1;
851 }
852 
853 static int gve_num_descs_per_buf(size_t size)
854 {
855 	return DIV_ROUND_UP(size, GVE_TX_MAX_BUF_SIZE_DQO);
856 }
857 
858 static int gve_num_buffer_descs_needed(const struct sk_buff *skb)
859 {
860 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
861 	int num_descs;
862 	int i;
863 
864 	num_descs = gve_num_descs_per_buf(skb_headlen(skb));
865 
866 	for (i = 0; i < shinfo->nr_frags; i++) {
867 		unsigned int frag_size = skb_frag_size(&shinfo->frags[i]);
868 
869 		num_descs += gve_num_descs_per_buf(frag_size);
870 	}
871 
872 	return num_descs;
873 }
874 
875 /* Returns true if HW is capable of sending TSO represented by `skb`.
876  *
877  * Each segment must not span more than GVE_TX_MAX_DATA_DESCS buffers.
878  * - The header is counted as one buffer for every single segment.
879  * - A buffer which is split between two segments is counted for both.
880  * - If a buffer contains both header and payload, it is counted as two buffers.
881  */
882 static bool gve_can_send_tso(const struct sk_buff *skb)
883 {
884 	const int max_bufs_per_seg = GVE_TX_MAX_DATA_DESCS - 1;
885 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
886 	const int header_len = skb_tcp_all_headers(skb);
887 	const int gso_size = shinfo->gso_size;
888 	int cur_seg_num_bufs;
889 	int prev_frag_size;
890 	int cur_seg_size;
891 	int i;
892 
893 	cur_seg_size = skb_headlen(skb) - header_len;
894 	prev_frag_size = skb_headlen(skb);
895 	cur_seg_num_bufs = cur_seg_size > 0;
896 
897 	for (i = 0; i < shinfo->nr_frags; i++) {
898 		if (cur_seg_size >= gso_size) {
899 			cur_seg_size %= gso_size;
900 			cur_seg_num_bufs = cur_seg_size > 0;
901 
902 			if (prev_frag_size > GVE_TX_MAX_BUF_SIZE_DQO) {
903 				int prev_frag_remain = prev_frag_size %
904 					GVE_TX_MAX_BUF_SIZE_DQO;
905 
906 				/* If the last descriptor of the previous frag
907 				 * is less than cur_seg_size, the segment will
908 				 * span two descriptors in the previous frag.
909 				 * Since max gso size (9728) is less than
910 				 * GVE_TX_MAX_BUF_SIZE_DQO, it is impossible
911 				 * for the segment to span more than two
912 				 * descriptors.
913 				 */
914 				if (prev_frag_remain &&
915 				    cur_seg_size > prev_frag_remain)
916 					cur_seg_num_bufs++;
917 			}
918 		}
919 
920 		if (unlikely(++cur_seg_num_bufs > max_bufs_per_seg))
921 			return false;
922 
923 		prev_frag_size = skb_frag_size(&shinfo->frags[i]);
924 		cur_seg_size += prev_frag_size;
925 	}
926 
927 	return true;
928 }
929 
930 netdev_features_t gve_features_check_dqo(struct sk_buff *skb,
931 					 struct net_device *dev,
932 					 netdev_features_t features)
933 {
934 	if (skb_is_gso(skb) && !gve_can_send_tso(skb))
935 		return features & ~NETIF_F_GSO_MASK;
936 
937 	return features;
938 }
939 
940 /* Attempt to transmit specified SKB.
941  *
942  * Returns 0 if the SKB was transmitted or dropped.
943  * Returns -1 if there is not currently enough space to transmit the SKB.
944  */
945 static int gve_try_tx_skb(struct gve_priv *priv, struct gve_tx_ring *tx,
946 			  struct sk_buff *skb)
947 {
948 	int num_buffer_descs;
949 	int total_num_descs;
950 
951 	if (skb_is_gso(skb) && unlikely(ipv6_hopopt_jumbo_remove(skb)))
952 		goto drop;
953 
954 	if (tx->dqo.qpl) {
955 		/* We do not need to verify the number of buffers used per
956 		 * packet or per segment in the TSO case, since with 2K size
957 		 * buffers none of the TX packet rules would be violated.
958 		 *
959 		 * gve_can_send_tso() checks that each TCP segment of gso_size
960 		 * is not distributed over more than 9 SKB frags.
961 		 */
962 		num_buffer_descs = DIV_ROUND_UP(skb->len, GVE_TX_BUF_SIZE_DQO);
963 	} else {
964 		num_buffer_descs = gve_num_buffer_descs_needed(skb);
965 		if (!skb_is_gso(skb)) {
966 			if (unlikely(num_buffer_descs > GVE_TX_MAX_DATA_DESCS)) {
967 				if (unlikely(skb_linearize(skb) < 0))
968 					goto drop;
969 
970 				num_buffer_descs = 1;
971 			}
972 		}
973 	}
974 
975 	/* Metadata + (optional TSO) + data descriptors. */
976 	total_num_descs = 1 + skb_is_gso(skb) + num_buffer_descs;
977 	if (unlikely(gve_maybe_stop_tx_dqo(tx, total_num_descs,
978 					   num_buffer_descs))) {
979 		return -1;
980 	}
981 
982 	if (unlikely(gve_tx_add_skb_dqo(tx, skb) < 0))
983 		goto drop;
984 
985 	netdev_tx_sent_queue(tx->netdev_txq, skb->len);
986 	skb_tx_timestamp(skb);
987 	return 0;
988 
989 drop:
990 	tx->dropped_pkt++;
991 	dev_kfree_skb_any(skb);
992 	return 0;
993 }
994 
995 /* Transmit a given skb and ring the doorbell. */
996 netdev_tx_t gve_tx_dqo(struct sk_buff *skb, struct net_device *dev)
997 {
998 	struct gve_priv *priv = netdev_priv(dev);
999 	struct gve_tx_ring *tx;
1000 
1001 	tx = &priv->tx[skb_get_queue_mapping(skb)];
1002 	if (unlikely(gve_try_tx_skb(priv, tx, skb) < 0)) {
1003 		/* We need to ring the txq doorbell -- we have stopped the Tx
1004 		 * queue for want of resources, but prior calls to gve_tx()
1005 		 * may have added descriptors without ringing the doorbell.
1006 		 */
1007 		gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
1008 		return NETDEV_TX_BUSY;
1009 	}
1010 
1011 	if (!netif_xmit_stopped(tx->netdev_txq) && netdev_xmit_more())
1012 		return NETDEV_TX_OK;
1013 
1014 	gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
1015 	return NETDEV_TX_OK;
1016 }
1017 
1018 static void add_to_list(struct gve_tx_ring *tx, struct gve_index_list *list,
1019 			struct gve_tx_pending_packet_dqo *pending_packet)
1020 {
1021 	s16 old_tail, index;
1022 
1023 	index = pending_packet - tx->dqo.pending_packets;
1024 	old_tail = list->tail;
1025 	list->tail = index;
1026 	if (old_tail == -1)
1027 		list->head = index;
1028 	else
1029 		tx->dqo.pending_packets[old_tail].next = index;
1030 
1031 	pending_packet->next = -1;
1032 	pending_packet->prev = old_tail;
1033 }
1034 
1035 static void remove_from_list(struct gve_tx_ring *tx,
1036 			     struct gve_index_list *list,
1037 			     struct gve_tx_pending_packet_dqo *pkt)
1038 {
1039 	s16 prev_index, next_index;
1040 
1041 	prev_index = pkt->prev;
1042 	next_index = pkt->next;
1043 
1044 	if (prev_index == -1) {
1045 		/* Node is head */
1046 		list->head = next_index;
1047 	} else {
1048 		tx->dqo.pending_packets[prev_index].next = next_index;
1049 	}
1050 	if (next_index == -1) {
1051 		/* Node is tail */
1052 		list->tail = prev_index;
1053 	} else {
1054 		tx->dqo.pending_packets[next_index].prev = prev_index;
1055 	}
1056 }
1057 
1058 static void gve_unmap_packet(struct device *dev,
1059 			     struct gve_tx_pending_packet_dqo *pkt)
1060 {
1061 	int i;
1062 
1063 	/* SKB linear portion is guaranteed to be mapped */
1064 	dma_unmap_single(dev, dma_unmap_addr(pkt, dma[0]),
1065 			 dma_unmap_len(pkt, len[0]), DMA_TO_DEVICE);
1066 	for (i = 1; i < pkt->num_bufs; i++) {
1067 		netmem_dma_unmap_page_attrs(dev, dma_unmap_addr(pkt, dma[i]),
1068 					    dma_unmap_len(pkt, len[i]),
1069 					    DMA_TO_DEVICE, 0);
1070 	}
1071 	pkt->num_bufs = 0;
1072 }
1073 
1074 /* Completion types and expected behavior:
1075  * No Miss compl + Packet compl = Packet completed normally.
1076  * Miss compl + Re-inject compl = Packet completed normally.
1077  * No Miss compl + Re-inject compl = Skipped i.e. packet not completed.
1078  * Miss compl + Packet compl = Skipped i.e. packet not completed.
1079  */
1080 static void gve_handle_packet_completion(struct gve_priv *priv,
1081 					 struct gve_tx_ring *tx, bool is_napi,
1082 					 u16 compl_tag, u64 *bytes, u64 *pkts,
1083 					 bool is_reinjection)
1084 {
1085 	struct gve_tx_pending_packet_dqo *pending_packet;
1086 
1087 	if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) {
1088 		net_err_ratelimited("%s: Invalid TX completion tag: %d\n",
1089 				    priv->dev->name, (int)compl_tag);
1090 		return;
1091 	}
1092 
1093 	pending_packet = &tx->dqo.pending_packets[compl_tag];
1094 
1095 	if (unlikely(is_reinjection)) {
1096 		if (unlikely(pending_packet->state ==
1097 			     GVE_PACKET_STATE_TIMED_OUT_COMPL)) {
1098 			net_err_ratelimited("%s: Re-injection completion: %d received after timeout.\n",
1099 					    priv->dev->name, (int)compl_tag);
1100 			/* Packet was already completed as a result of timeout,
1101 			 * so just remove from list and free pending packet.
1102 			 */
1103 			remove_from_list(tx,
1104 					 &tx->dqo_compl.timed_out_completions,
1105 					 pending_packet);
1106 			gve_free_pending_packet(tx, pending_packet);
1107 			return;
1108 		}
1109 		if (unlikely(pending_packet->state !=
1110 			     GVE_PACKET_STATE_PENDING_REINJECT_COMPL)) {
1111 			/* The packet is allocated but has no outstanding miss
1112 			 * completion, which means it received a re-injection
1113 			 * completion without a prior miss completion. Return
1114 			 * without completing the packet.
1115 			 */
1116 			net_err_ratelimited("%s: Re-injection completion received without corresponding miss completion: %d\n",
1117 					    priv->dev->name, (int)compl_tag);
1118 			return;
1119 		}
1120 		remove_from_list(tx, &tx->dqo_compl.miss_completions,
1121 				 pending_packet);
1122 	} else {
1123 		/* Packet is allocated but not a pending data completion. */
1124 		if (unlikely(pending_packet->state !=
1125 			     GVE_PACKET_STATE_PENDING_DATA_COMPL)) {
1126 			net_err_ratelimited("%s: No pending data completion: %d\n",
1127 					    priv->dev->name, (int)compl_tag);
1128 			return;
1129 		}
1130 	}
1131 	tx->dqo_tx.completed_packet_desc_cnt += pending_packet->num_bufs;
1132 
1133 	switch (pending_packet->type) {
1134 	case GVE_TX_PENDING_PACKET_DQO_SKB:
1135 		if (tx->dqo.qpl)
1136 			gve_free_tx_qpl_bufs(tx, pending_packet);
1137 		else
1138 			gve_unmap_packet(tx->dev, pending_packet);
1139 		(*pkts)++;
1140 		*bytes += pending_packet->skb->len;
1141 
1142 		napi_consume_skb(pending_packet->skb, is_napi);
1143 		pending_packet->skb = NULL;
1144 		gve_free_pending_packet(tx, pending_packet);
1145 		break;
1146 	case GVE_TX_PENDING_PACKET_DQO_XDP_FRAME:
1147 		gve_unmap_packet(tx->dev, pending_packet);
1148 		(*pkts)++;
1149 		*bytes += pending_packet->xdpf->len;
1150 
1151 		xdp_return_frame(pending_packet->xdpf);
1152 		pending_packet->xdpf = NULL;
1153 		gve_free_pending_packet(tx, pending_packet);
1154 		break;
1155 	default:
1156 		WARN_ON_ONCE(1);
1157 	}
1158 }
1159 
1160 static void gve_handle_miss_completion(struct gve_priv *priv,
1161 				       struct gve_tx_ring *tx, u16 compl_tag,
1162 				       u64 *bytes, u64 *pkts)
1163 {
1164 	struct gve_tx_pending_packet_dqo *pending_packet;
1165 
1166 	if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) {
1167 		net_err_ratelimited("%s: Invalid TX completion tag: %d\n",
1168 				    priv->dev->name, (int)compl_tag);
1169 		return;
1170 	}
1171 
1172 	pending_packet = &tx->dqo.pending_packets[compl_tag];
1173 	if (unlikely(pending_packet->state !=
1174 				GVE_PACKET_STATE_PENDING_DATA_COMPL)) {
1175 		net_err_ratelimited("%s: Unexpected packet state: %d for completion tag : %d\n",
1176 				    priv->dev->name, (int)pending_packet->state,
1177 				    (int)compl_tag);
1178 		return;
1179 	}
1180 
1181 	pending_packet->state = GVE_PACKET_STATE_PENDING_REINJECT_COMPL;
1182 	/* jiffies can wrap around but time comparisons can handle overflows. */
1183 	pending_packet->timeout_jiffies =
1184 			jiffies +
1185 			secs_to_jiffies(GVE_REINJECT_COMPL_TIMEOUT);
1186 	add_to_list(tx, &tx->dqo_compl.miss_completions, pending_packet);
1187 
1188 	*bytes += pending_packet->skb->len;
1189 	(*pkts)++;
1190 }
1191 
1192 static void remove_miss_completions(struct gve_priv *priv,
1193 				    struct gve_tx_ring *tx)
1194 {
1195 	struct gve_tx_pending_packet_dqo *pending_packet;
1196 	s16 next_index;
1197 
1198 	next_index = tx->dqo_compl.miss_completions.head;
1199 	while (next_index != -1) {
1200 		pending_packet = &tx->dqo.pending_packets[next_index];
1201 		next_index = pending_packet->next;
1202 		/* Break early because packets should time out in order. */
1203 		if (time_is_after_jiffies(pending_packet->timeout_jiffies))
1204 			break;
1205 
1206 		remove_from_list(tx, &tx->dqo_compl.miss_completions,
1207 				 pending_packet);
1208 		/* Unmap/free the TX buffers and free the skb, but do not free the
1209 		 * pending packet; the completion tag is kept so that the driver
1210 		 * can take appropriate action if a corresponding valid
1211 		 * completion is received later.
1212 		 */
1213 		if (tx->dqo.qpl)
1214 			gve_free_tx_qpl_bufs(tx, pending_packet);
1215 		else
1216 			gve_unmap_packet(tx->dev, pending_packet);
1217 
1218 		/* This indicates the packet was dropped. */
1219 		dev_kfree_skb_any(pending_packet->skb);
1220 		pending_packet->skb = NULL;
1221 		tx->dropped_pkt++;
1222 		net_err_ratelimited("%s: No reinjection completion was received for: %d.\n",
1223 				    priv->dev->name,
1224 				    (int)(pending_packet - tx->dqo.pending_packets));
1225 
1226 		pending_packet->state = GVE_PACKET_STATE_TIMED_OUT_COMPL;
1227 		pending_packet->timeout_jiffies =
1228 				jiffies +
1229 				secs_to_jiffies(GVE_DEALLOCATE_COMPL_TIMEOUT);
1230 		/* Maintain pending packet in another list so the packet can be
1231 		 * unallocated at a later time.
1232 		 */
1233 		add_to_list(tx, &tx->dqo_compl.timed_out_completions,
1234 			    pending_packet);
1235 	}
1236 }
1237 
1238 static void remove_timed_out_completions(struct gve_priv *priv,
1239 					 struct gve_tx_ring *tx)
1240 {
1241 	struct gve_tx_pending_packet_dqo *pending_packet;
1242 	s16 next_index;
1243 
1244 	next_index = tx->dqo_compl.timed_out_completions.head;
1245 	while (next_index != -1) {
1246 		pending_packet = &tx->dqo.pending_packets[next_index];
1247 		next_index = pending_packet->next;
1248 		/* Break early because packets should time out in order. */
1249 		if (time_is_after_jiffies(pending_packet->timeout_jiffies))
1250 			break;
1251 
1252 		remove_from_list(tx, &tx->dqo_compl.timed_out_completions,
1253 				 pending_packet);
1254 		gve_free_pending_packet(tx, pending_packet);
1255 	}
1256 }
1257 
1258 int gve_clean_tx_done_dqo(struct gve_priv *priv, struct gve_tx_ring *tx,
1259 			  struct napi_struct *napi)
1260 {
1261 	u64 reinject_compl_bytes = 0;
1262 	u64 reinject_compl_pkts = 0;
1263 	int num_descs_cleaned = 0;
1264 	u64 miss_compl_bytes = 0;
1265 	u64 miss_compl_pkts = 0;
1266 	u64 pkt_compl_bytes = 0;
1267 	u64 pkt_compl_pkts = 0;
1268 
1269 	/* Limit in order to avoid blocking for too long */
1270 	while (!napi || pkt_compl_pkts < napi->weight) {
1271 		struct gve_tx_compl_desc *compl_desc =
1272 			&tx->dqo.compl_ring[tx->dqo_compl.head];
1273 		u16 type;
1274 
1275 		if (compl_desc->generation == tx->dqo_compl.cur_gen_bit)
1276 			break;
1277 
1278 		/* Prefetch the next descriptor. */
1279 		prefetch(&tx->dqo.compl_ring[(tx->dqo_compl.head + 1) &
1280 				tx->dqo.complq_mask]);
1281 
1282 		/* Do not read data until we own the descriptor */
1283 		dma_rmb();
1284 		type = compl_desc->type;
1285 
1286 		if (type == GVE_COMPL_TYPE_DQO_DESC) {
1287 			/* This is the last descriptor fetched by HW plus one */
1288 			u16 tx_head = le16_to_cpu(compl_desc->tx_head);
1289 
1290 			atomic_set_release(&tx->dqo_compl.hw_tx_head, tx_head);
1291 		} else if (type == GVE_COMPL_TYPE_DQO_PKT) {
1292 			u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);
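
			/* A packet completion with GVE_ALT_MISS_COMPL_BIT set
			 * in its tag is handled as a miss completion.
			 */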
1293 			if (compl_tag & GVE_ALT_MISS_COMPL_BIT) {
1294 				compl_tag &= ~GVE_ALT_MISS_COMPL_BIT;
1295 				gve_handle_miss_completion(priv, tx, compl_tag,
1296 							   &miss_compl_bytes,
1297 							   &miss_compl_pkts);
1298 			} else {
1299 				gve_handle_packet_completion(priv, tx, !!napi,
1300 							     compl_tag,
1301 							     &pkt_compl_bytes,
1302 							     &pkt_compl_pkts,
1303 							     false);
1304 			}
1305 		} else if (type == GVE_COMPL_TYPE_DQO_MISS) {
1306 			u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);
1307 
1308 			gve_handle_miss_completion(priv, tx, compl_tag,
1309 						   &miss_compl_bytes,
1310 						   &miss_compl_pkts);
1311 		} else if (type == GVE_COMPL_TYPE_DQO_REINJECTION) {
1312 			u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);
1313 
1314 			gve_handle_packet_completion(priv, tx, !!napi,
1315 						     compl_tag,
1316 						     &reinject_compl_bytes,
1317 						     &reinject_compl_pkts,
1318 						     true);
1319 		}
1320 
1321 		tx->dqo_compl.head =
1322 			(tx->dqo_compl.head + 1) & tx->dqo.complq_mask;
1323 		/* Flip the generation bit when we wrap around */
1324 		tx->dqo_compl.cur_gen_bit ^= tx->dqo_compl.head == 0;
1325 		num_descs_cleaned++;
1326 	}
1327 
1328 	if (tx->netdev_txq)
1329 		netdev_tx_completed_queue(tx->netdev_txq,
1330 					  pkt_compl_pkts + miss_compl_pkts,
1331 					  pkt_compl_bytes + miss_compl_bytes);
1332 
1333 	remove_miss_completions(priv, tx);
1334 	remove_timed_out_completions(priv, tx);
1335 
1336 	u64_stats_update_begin(&tx->statss);
1337 	tx->bytes_done += pkt_compl_bytes + reinject_compl_bytes;
1338 	tx->pkt_done += pkt_compl_pkts + reinject_compl_pkts;
1339 	u64_stats_update_end(&tx->statss);
1340 	return num_descs_cleaned;
1341 }
1342 
1343 bool gve_tx_poll_dqo(struct gve_notify_block *block, bool do_clean)
1344 {
1345 	struct gve_tx_compl_desc *compl_desc;
1346 	struct gve_tx_ring *tx = block->tx;
1347 	struct gve_priv *priv = block->priv;
1348 
1349 	if (do_clean) {
1350 		int num_descs_cleaned = gve_clean_tx_done_dqo(priv, tx,
1351 							      &block->napi);
1352 
1353 		/* Sync with queue being stopped in `gve_maybe_stop_tx_dqo()` */
1354 		mb();
1355 
1356 		if (netif_tx_queue_stopped(tx->netdev_txq) &&
1357 		    num_descs_cleaned > 0) {
1358 			tx->wake_queue++;
1359 			netif_tx_wake_queue(tx->netdev_txq);
1360 		}
1361 	}
1362 
1363 	/* Return true if we still have work. */
1364 	compl_desc = &tx->dqo.compl_ring[tx->dqo_compl.head];
1365 	return compl_desc->generation != tx->dqo_compl.cur_gen_bit;
1366 }
1367 
1368 bool gve_xdp_poll_dqo(struct gve_notify_block *block)
1369 {
1370 	struct gve_tx_compl_desc *compl_desc;
1371 	struct gve_tx_ring *tx = block->tx;
1372 	struct gve_priv *priv = block->priv;
1373 
1374 	gve_clean_tx_done_dqo(priv, tx, &block->napi);
1375 
1376 	/* Return true if we still have work. */
1377 	compl_desc = &tx->dqo.compl_ring[tx->dqo_compl.head];
1378 	return compl_desc->generation != tx->dqo_compl.cur_gen_bit;
1379 }
1380 
1381 int gve_xdp_xmit_one_dqo(struct gve_priv *priv, struct gve_tx_ring *tx,
1382 			 struct xdp_frame *xdpf)
1383 {
1384 	struct gve_tx_pending_packet_dqo *pkt;
1385 	u32 desc_idx = tx->dqo_tx.tail;
1386 	s16 completion_tag;
1387 	int num_descs = 1;
1388 	dma_addr_t addr;
1389 	int err;
1390 
1391 	if (unlikely(!gve_has_tx_slots_available(tx, num_descs)))
1392 		return -EBUSY;
1393 
1394 	pkt = gve_alloc_pending_packet(tx);
1395 	if (unlikely(!pkt))
1396 		return -EBUSY;
1397 
1398 	pkt->type = GVE_TX_PENDING_PACKET_DQO_XDP_FRAME;
1399 	pkt->num_bufs = 0;
1400 	pkt->xdpf = xdpf;
1401 	completion_tag = pkt - tx->dqo.pending_packets;
1402 
1403 	/* Generate Packet Descriptor */
1404 	addr = dma_map_single(tx->dev, xdpf->data, xdpf->len, DMA_TO_DEVICE);
1405 	err = dma_mapping_error(tx->dev, addr);
1406 	if (unlikely(err))
1407 		goto err;
1408 
1409 	dma_unmap_len_set(pkt, len[pkt->num_bufs], xdpf->len);
1410 	dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr);
1411 	pkt->num_bufs++;
1412 
1413 	gve_tx_fill_pkt_desc_dqo(tx, &desc_idx,
1414 				 false, xdpf->len,
1415 				 addr, completion_tag, true,
1416 				 false);
1417 
1418 	gve_tx_update_tail(tx, desc_idx);
1419 	return 0;
1420 
1421 err:
1422 	pkt->xdpf = NULL;
1423 	pkt->num_bufs = 0;
1424 	gve_free_pending_packet(tx, pkt);
1425 	return err;
1426 }
1427 
1428 int gve_xdp_xmit_dqo(struct net_device *dev, int n, struct xdp_frame **frames,
1429 		     u32 flags)
1430 {
1431 	struct gve_priv *priv = netdev_priv(dev);
1432 	struct gve_tx_ring *tx;
1433 	int i, err = 0, qid;
1434 
1435 	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
1436 		return -EINVAL;
1437 
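	/* Spread XDP transmit across the XDP queues by CPU; the per-ring
	 * xdp_lock serializes producers that land on the same queue.
	 */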
1438 	qid = gve_xdp_tx_queue_id(priv,
1439 				  smp_processor_id() % priv->tx_cfg.num_xdp_queues);
1440 
1441 	tx = &priv->tx[qid];
1442 
1443 	spin_lock(&tx->dqo_tx.xdp_lock);
1444 	for (i = 0; i < n; i++) {
1445 		err = gve_xdp_xmit_one_dqo(priv, tx, frames[i]);
1446 		if (err)
1447 			break;
1448 	}
1449 
1450 	if (flags & XDP_XMIT_FLUSH)
1451 		gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
1452 
1453 	spin_unlock(&tx->dqo_tx.xdp_lock);
1454 
1455 	u64_stats_update_begin(&tx->statss);
1456 	tx->xdp_xmit += n;
1457 	tx->xdp_xmit_errors += n - i;
1458 	u64_stats_update_end(&tx->statss);
1459 
1460 	return i ? i : err;
1461 }
1462