xref: /freebsd/sys/dev/gve/gve_rx.c (revision 5ca8c28cd8c725b81781201cfdb5f9969396f934)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 2023 Google LLC
5  *
6  * Redistribution and use in source and binary forms, with or without modification,
7  * are permitted provided that the following conditions are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright notice, this
10  *    list of conditions and the following disclaimer.
11  *
12  * 2. Redistributions in binary form must reproduce the above copyright notice,
13  *    this list of conditions and the following disclaimer in the documentation
14  *    and/or other materials provided with the distribution.
15  *
16  * 3. Neither the name of the copyright holder nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software without
18  *    specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
22  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
24  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
25  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
26  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
27  * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
29  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  */
31 #include "gve.h"
32 #include "gve_adminq.h"
33 
34 static void
35 gve_rx_free_ring(struct gve_priv *priv, int i)
36 {
37 	struct gve_rx_ring *rx = &priv->rx[i];
38 	struct gve_ring_com *com = &rx->com;
39 
40         /* Safe to call even if never allocated */
41 	gve_free_counters((counter_u64_t *)&rx->stats, NUM_RX_STATS);
42 
43 	if (rx->page_info != NULL) {
44 		free(rx->page_info, M_GVE);
45 		rx->page_info = NULL;
46 	}
47 
48 	if (rx->data_ring != NULL) {
49 		gve_dma_free_coherent(&rx->data_ring_mem);
50 		rx->data_ring = NULL;
51 	}
52 
53 	if (rx->desc_ring != NULL) {
54 		gve_dma_free_coherent(&rx->desc_ring_mem);
55 		rx->desc_ring = NULL;
56 	}
57 
58 	if (com->q_resources != NULL) {
59 		gve_dma_free_coherent(&com->q_resources_mem);
60 		com->q_resources = NULL;
61 	}
62 }
63 
64 static void
65 gve_prefill_rx_slots(struct gve_rx_ring *rx)
66 {
67 	struct gve_ring_com *com = &rx->com;
68 	struct gve_dma_handle *dma;
69 	int i;
70 
71 	for (i = 0; i < com->priv->rx_desc_cnt; i++) {
72 		rx->data_ring[i].qpl_offset = htobe64(PAGE_SIZE * i);
73 		rx->page_info[i].page_offset = 0;
74 		rx->page_info[i].page_address = com->qpl->dmas[i].cpu_addr;
75 		rx->page_info[i].page = com->qpl->pages[i];
76 
77 		dma = &com->qpl->dmas[i];
78 		bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREREAD);
79 	}
80 
81 	bus_dmamap_sync(rx->data_ring_mem.tag, rx->data_ring_mem.map,
82 	    BUS_DMASYNC_PREWRITE);
83 }
84 
85 static int
86 gve_rx_alloc_ring(struct gve_priv *priv, int i)
87 {
88 	struct gve_rx_ring *rx = &priv->rx[i];
89 	struct gve_ring_com *com = &rx->com;
90 	int err;
91 
92 	com->priv = priv;
93 	com->id = i;
94 
95 	rx->mask = priv->rx_pages_per_qpl - 1;
96 
97 	com->qpl = &priv->qpls[priv->tx_cfg.max_queues + i];
98 	if (com->qpl == NULL) {
99 		device_printf(priv->dev, "No QPL left for rx ring %d", i);
100 		return (ENOMEM);
101 	}
102 
103 	rx->page_info = malloc(priv->rx_desc_cnt * sizeof(*rx->page_info), M_GVE,
104 	    M_WAITOK | M_ZERO);
105 
106 	gve_alloc_counters((counter_u64_t *)&rx->stats, NUM_RX_STATS);
107 
108 	err = gve_dma_alloc_coherent(priv, sizeof(struct gve_queue_resources),
109 	    PAGE_SIZE, &com->q_resources_mem);
110 	if (err != 0) {
111 		device_printf(priv->dev, "Failed to alloc queue resources for rx ring %d", i);
112 		goto abort;
113 	}
114 	com->q_resources = com->q_resources_mem.cpu_addr;
115 
116 	err = gve_dma_alloc_coherent(priv,
117 	    sizeof(struct gve_rx_desc) * priv->rx_desc_cnt,
118 	    CACHE_LINE_SIZE, &rx->desc_ring_mem);
119 	if (err != 0) {
120 		device_printf(priv->dev, "Failed to alloc desc ring for rx ring %d", i);
121 		goto abort;
122 	}
123 	rx->desc_ring = rx->desc_ring_mem.cpu_addr;
124 
125 	err = gve_dma_alloc_coherent(priv,
126 	    sizeof(union gve_rx_data_slot) * priv->rx_desc_cnt,
127 	    CACHE_LINE_SIZE, &rx->data_ring_mem);
128 	if (err != 0) {
129 		device_printf(priv->dev, "Failed to alloc data ring for rx ring %d", i);
130 		goto abort;
131 	}
132 	rx->data_ring = rx->data_ring_mem.cpu_addr;
133 
134 	gve_prefill_rx_slots(rx);
135 	return (0);
136 
137 abort:
138 	gve_rx_free_ring(priv, i);
139 	return (err);
140 }
141 
142 int
143 gve_alloc_rx_rings(struct gve_priv *priv)
144 {
145 	int err = 0;
146 	int i;
147 
148 	priv->rx = malloc(sizeof(struct gve_rx_ring) * priv->rx_cfg.num_queues,
149 	    M_GVE, M_WAITOK | M_ZERO);
150 
151 	for (i = 0; i < priv->rx_cfg.num_queues; i++) {
152 		err = gve_rx_alloc_ring(priv, i);
153 		if (err != 0)
154 			goto free_rings;
155 	}
156 
157 	return (0);
158 
159 free_rings:
160 	while (i--)
161 		gve_rx_free_ring(priv, i);
162 	free(priv->rx, M_GVE);
163 	return (err);
164 }
165 
166 void
167 gve_free_rx_rings(struct gve_priv *priv)
168 {
169 	int i;
170 
171 	for (i = 0; i < priv->rx_cfg.num_queues; i++)
172 		gve_rx_free_ring(priv, i);
173 
174 	free(priv->rx, M_GVE);
175 }
176 
177 static void
178 gve_rx_clear_data_ring(struct gve_rx_ring *rx)
179 {
180 	struct gve_priv *priv = rx->com.priv;
181 	int i;
182 
183 	/*
184 	 * The Rx data ring has this invariant: "the networking stack is not
185 	 * using the buffer beginning at any page_offset". This invariant is
186 	 * established initially by gve_prefill_rx_slots at alloc-time and is
187 	 * maintained by the cleanup taskqueue. This invariant implies that the
188 	 * ring can be considered to be fully posted with buffers at this point,
189 	 * even if there are unfreed mbufs still being processed, which is why we
190 	 * can fill the ring without waiting on can_flip at each slot to become true.
191 	 */
192 	for (i = 0; i < priv->rx_desc_cnt; i++) {
193 		rx->data_ring[i].qpl_offset = htobe64(PAGE_SIZE * i +
194 		    rx->page_info[i].page_offset);
195 		rx->fill_cnt++;
196 	}
197 
198 	bus_dmamap_sync(rx->data_ring_mem.tag, rx->data_ring_mem.map,
199 	    BUS_DMASYNC_PREWRITE);
200 }
201 
202 static void
203 gve_rx_clear_desc_ring(struct gve_rx_ring *rx)
204 {
205 	struct gve_priv *priv = rx->com.priv;
206 	int i;
207 
208 	for (i = 0; i < priv->rx_desc_cnt; i++)
209 		rx->desc_ring[i] = (struct gve_rx_desc){};
210 
211 	bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map,
212 	    BUS_DMASYNC_PREWRITE);
213 }
214 
215 static void
216 gve_clear_rx_ring(struct gve_priv *priv, int i)
217 {
218 	struct gve_rx_ring *rx = &priv->rx[i];
219 
220 	rx->seq_no = 1;
221 	rx->cnt = 0;
222 	rx->fill_cnt = 0;
223 	rx->mask = priv->rx_desc_cnt - 1;
224 
225 	gve_rx_clear_desc_ring(rx);
226 	gve_rx_clear_data_ring(rx);
227 }
228 
229 static void
230 gve_start_rx_ring(struct gve_priv *priv, int i)
231 {
232 	struct gve_rx_ring *rx = &priv->rx[i];
233 	struct gve_ring_com *com = &rx->com;
234 
235 	if ((if_getcapenable(priv->ifp) & IFCAP_LRO) != 0) {
236 		if (tcp_lro_init(&rx->lro) != 0)
237 			device_printf(priv->dev, "Failed to init lro for rx ring %d", i);
238 		rx->lro.ifp = priv->ifp;
239 	}
240 
241 	NET_TASK_INIT(&com->cleanup_task, 0, gve_rx_cleanup_tq, rx);
242 	com->cleanup_tq = taskqueue_create_fast("gve rx", M_WAITOK,
243 	    taskqueue_thread_enqueue, &com->cleanup_tq);
244 
245 	taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET,
246 	    "%s rxq %d", device_get_nameunit(priv->dev), i);
247 
248 	gve_db_bar_write_4(priv, com->db_offset, rx->fill_cnt);
249 }
250 
251 int
252 gve_create_rx_rings(struct gve_priv *priv)
253 {
254 	struct gve_ring_com *com;
255 	struct gve_rx_ring *rx;
256 	int err;
257 	int i;
258 
259 	if (gve_get_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK))
260 		return (0);
261 
262 	for (i = 0; i < priv->rx_cfg.num_queues; i++)
263 		gve_clear_rx_ring(priv, i);
264 
265 	err = gve_adminq_create_rx_queues(priv, priv->rx_cfg.num_queues);
266 	if (err != 0)
267 		return (err);
268 
269 	bus_dmamap_sync(priv->irqs_db_mem.tag, priv->irqs_db_mem.map,
270 	    BUS_DMASYNC_POSTREAD);
271 
272 	for (i = 0; i < priv->rx_cfg.num_queues; i++) {
273 		rx = &priv->rx[i];
274 		com = &rx->com;
275 
276 		com->irq_db_offset = 4 * be32toh(priv->irq_db_indices[com->ntfy_id].index);
277 
278 		bus_dmamap_sync(com->q_resources_mem.tag, com->q_resources_mem.map,
279 		    BUS_DMASYNC_POSTREAD);
280 		com->db_offset = 4 * be32toh(com->q_resources->db_index);
281 		com->counter_idx = be32toh(com->q_resources->counter_index);
282 
283 		gve_start_rx_ring(priv, i);
284 	}
285 
286 	gve_set_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK);
287 	return (0);
288 }
289 
290 static void
291 gve_stop_rx_ring(struct gve_priv *priv, int i)
292 {
293 	struct gve_rx_ring *rx = &priv->rx[i];
294 	struct gve_ring_com *com = &rx->com;
295 
296 	if (com->cleanup_tq != NULL) {
297 		taskqueue_quiesce(com->cleanup_tq);
298 		taskqueue_free(com->cleanup_tq);
299 		com->cleanup_tq = NULL;
300 	}
301 
302 	tcp_lro_free(&rx->lro);
303 	rx->ctx = (struct gve_rx_ctx){};
304 }
305 
306 int
307 gve_destroy_rx_rings(struct gve_priv *priv)
308 {
309 	int err;
310 	int i;
311 
312 	for (i = 0; i < priv->rx_cfg.num_queues; i++)
313 		gve_stop_rx_ring(priv, i);
314 
315 	if (gve_get_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK)) {
316 		err = gve_adminq_destroy_rx_queues(priv, priv->rx_cfg.num_queues);
317 		if (err != 0)
318 			return (err);
319 		gve_clear_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK);
320 	}
321 
322 	return (0);
323 }
324 
325 int
326 gve_rx_intr(void *arg)
327 {
328 	struct gve_rx_ring *rx = arg;
329 	struct gve_priv *priv = rx->com.priv;
330 	struct gve_ring_com *com = &rx->com;
331 
332 	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
333 		return (FILTER_STRAY);
334 
335 	gve_db_bar_write_4(priv, com->irq_db_offset, GVE_IRQ_MASK);
336 	taskqueue_enqueue(rx->com.cleanup_tq, &rx->com.cleanup_task);
337 	return (FILTER_HANDLED);
338 }
339 
340 static inline void
341 gve_set_rss_type(__be16 flag, struct mbuf *mbuf)
342 {
343 	if ((flag & GVE_RXF_IPV4) != 0) {
344 		if ((flag & GVE_RXF_TCP) != 0)
345 			M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV4);
346 		else if ((flag & GVE_RXF_UDP) != 0)
347 			M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV4);
348 		else
349 			M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV4);
350 		return;
351 	}
352 
353 	if ((flag & GVE_RXF_IPV6) != 0) {
354 		if ((flag & GVE_RXF_TCP) != 0)
355 			M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV6);
356 		else if ((flag & GVE_RXF_UDP) != 0)
357 			M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV6);
358 		else
359 			M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV6);
360 		return;
361 	}
362 }
363 
364 static void
365 gve_mextadd_free(struct mbuf *mbuf)
366 {
367 	vm_page_t page = (vm_page_t)mbuf->m_ext.ext_arg1;
368 	vm_offset_t va = (vm_offset_t)mbuf->m_ext.ext_arg2;
369 
370 	/*
371 	 * Free the page only if this is the last ref.
372 	 * The interface might no longer exist by the time
373 	 * this callback is called, see gve_free_qpl.
374 	 */
375 	if (__predict_false(vm_page_unwire_noq(page))) {
376 		pmap_qremove(va, 1);
377 		kva_free(va, PAGE_SIZE);
378 		vm_page_free(page);
379 	}
380 }
381 
382 static void
383 gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, __be64 *slot_addr)
384 {
385 	const __be64 offset = htobe64(GVE_DEFAULT_RX_BUFFER_OFFSET);
386 	page_info->page_offset ^= GVE_DEFAULT_RX_BUFFER_OFFSET;
387 	*(slot_addr) ^= offset;
388 }
389 
390 static struct mbuf *
391 gve_rx_create_mbuf(struct gve_priv *priv, struct gve_rx_ring *rx,
392     struct gve_rx_slot_page_info *page_info, uint16_t len,
393     union gve_rx_data_slot *data_slot, bool is_only_frag)
394 {
395 	struct gve_rx_ctx *ctx = &rx->ctx;
396 	struct mbuf *mbuf;
397 	u_int ref_count;
398 	bool can_flip;
399 
400 	uint32_t offset = page_info->page_offset + page_info->pad;
401 	void *va = (char *)page_info->page_address + offset;
402 
403 	if (len <= priv->rx_copybreak && is_only_frag) {
404 		mbuf = m_get2(len, M_NOWAIT, MT_DATA, M_PKTHDR);
405 		if (__predict_false(mbuf == NULL))
406 			return (NULL);
407 
408 		m_copyback(mbuf, 0, len, va);
409 		counter_enter();
410 		counter_u64_add_protected(rx->stats.rx_copybreak_cnt, 1);
411 		counter_exit();
412 		ctx->mbuf_head = mbuf;
413 		ctx->mbuf_tail = mbuf;
414 	} else {
415 		struct mbuf *mbuf_tail = ctx->mbuf_tail;
416 		KASSERT(len <= MCLBYTES, ("gve rx fragment bigger than cluster mbuf"));
417 
418 		/*
419 		 * This page was created with VM_ALLOC_WIRED, thus the lowest
420 		 * wire count experienced by the page until the interface is
421 		 * destroyed is 1.
422 		 *
423 		 * We wire the page again before supplying an mbuf pointing to
424 		 * it to the networking stack, so before the mbuf leaves the
425 		 * driver, the wire count rises to 2.
426 		 *
427 		 * If it is 1 again, it necessarily means that the mbuf has been
428 		 * consumed and it was gve_mextadd_free that brought down the wire
429 		 * count back to 1. We only need to eventually observe the 1.
430 		 */
431 		ref_count = atomic_load_int(&page_info->page->ref_count);
432 		can_flip = VPRC_WIRE_COUNT(ref_count) == 1;
433 
434 		if (mbuf_tail == NULL) {
435 			if (can_flip)
436 				mbuf = m_gethdr(M_NOWAIT, MT_DATA);
437 			else
438 				mbuf = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
439 
440 			ctx->mbuf_head = mbuf;
441 			ctx->mbuf_tail = mbuf;
442 		} else {
443 			if (can_flip)
444 				mbuf = m_get(M_NOWAIT, MT_DATA);
445 			else
446 				mbuf = m_getcl(M_NOWAIT, MT_DATA, 0);
447 
448 			mbuf_tail->m_next = mbuf;
449 			ctx->mbuf_tail = mbuf;
450 		}
451 
452 		if (__predict_false(mbuf == NULL))
453 			return (NULL);
454 
455 		if (can_flip) {
456 			MEXTADD(mbuf, va, len, gve_mextadd_free,
457 			    page_info->page, page_info->page_address,
458 			    0, EXT_NET_DRV);
459 
460 			counter_enter();
461 			counter_u64_add_protected(rx->stats.rx_frag_flip_cnt, 1);
462 			counter_exit();
463 
464 			/*
465 			 * Grab an extra ref to the page so that gve_mextadd_free
466 			 * does not end up freeing the page while the interface exists.
467 			 */
468 			vm_page_wire(page_info->page);
469 
470 			gve_rx_flip_buff(page_info, &data_slot->qpl_offset);
471 		} else {
472 			m_copyback(mbuf, 0, len, va);
473 			counter_enter();
474 			counter_u64_add_protected(rx->stats.rx_frag_copy_cnt, 1);
475 			counter_exit();
476 		}
477 	}
478 
479 	mbuf->m_len = len;
480 	ctx->total_size += len;
481 
482 	return (mbuf);
483 }
484 
485 static inline bool
486 gve_needs_rss(__be16 flag)
487 {
488 	if ((flag & GVE_RXF_FRAG) != 0)
489 		return (false);
490 	if ((flag & (GVE_RXF_IPV4 | GVE_RXF_IPV6)) != 0)
491 		return (true);
492 	return (false);
493 }
494 
495 static void
496 gve_rx(struct gve_priv *priv, struct gve_rx_ring *rx, struct gve_rx_desc *desc,
497     uint32_t idx)
498 {
499 	struct gve_rx_slot_page_info *page_info;
500 	struct gve_dma_handle *page_dma_handle;
501 	union gve_rx_data_slot *data_slot;
502 	struct gve_rx_ctx *ctx = &rx->ctx;
503 	struct mbuf *mbuf = NULL;
504 	if_t ifp = priv->ifp;
505 	bool do_if_input;
506 	uint16_t len;
507 
508 	bool is_first_frag = ctx->frag_cnt == 0;
509 	bool is_last_frag = !(GVE_RXF_PKT_CONT & desc->flags_seq);
510 	bool is_only_frag = is_first_frag && is_last_frag;
511 
512 	if (__predict_false(ctx->drop_pkt))
513 		goto finish_frag;
514 
515 	if ((desc->flags_seq & GVE_RXF_ERR) != 0) {
516 		ctx->drop_pkt = true;
517 		counter_enter();
518 		counter_u64_add_protected(rx->stats.rx_dropped_pkt_desc_err, 1);
519 		counter_u64_add_protected(rx->stats.rx_dropped_pkt, 1);
520 		counter_exit();
521 		m_freem(ctx->mbuf_head);
522 		goto finish_frag;
523 	}
524 
525 	page_info = &rx->page_info[idx];
526 	data_slot = &rx->data_ring[idx];
527 	page_dma_handle = &(rx->com.qpl->dmas[idx]);
528 
529 	page_info->pad = is_first_frag ? GVE_RX_PAD : 0;
530 	len = be16toh(desc->len) - page_info->pad;
531 
532 	bus_dmamap_sync(page_dma_handle->tag, page_dma_handle->map,
533 	    BUS_DMASYNC_POSTREAD);
534 
535 	mbuf = gve_rx_create_mbuf(priv, rx, page_info, len, data_slot,
536 	    is_only_frag);
537 	if (mbuf == NULL) {
538 		ctx->drop_pkt = true;
539 		counter_enter();
540 		counter_u64_add_protected(rx->stats.rx_dropped_pkt_mbuf_alloc_fail, 1);
541 		counter_u64_add_protected(rx->stats.rx_dropped_pkt, 1);
542 		counter_exit();
543 		m_freem(ctx->mbuf_head);
544 		goto finish_frag;
545 	}
546 
547 	if (is_first_frag) {
548 		mbuf->m_pkthdr.rcvif = priv->ifp;
549 		ctx->is_tcp = desc->flags_seq & GVE_RXF_TCP;
550 
551 		if (gve_needs_rss(desc->flags_seq)) {
552 			gve_set_rss_type(desc->flags_seq, mbuf);
553 			mbuf->m_pkthdr.flowid = be32toh(desc->rss_hash);
554 		}
555 
556 		if ((desc->csum != 0) && ((desc->flags_seq & GVE_RXF_FRAG) == 0)) {
557 			mbuf->m_pkthdr.csum_flags = CSUM_IP_CHECKED |
558 				                    CSUM_IP_VALID |
559 						    CSUM_DATA_VALID |
560 						    CSUM_PSEUDO_HDR;
561 			mbuf->m_pkthdr.csum_data = 0xffff;
562 		}
563 	}
564 
565 	if (is_last_frag) {
566 		mbuf = ctx->mbuf_head;
567 		mbuf->m_pkthdr.len = ctx->total_size;
568 		do_if_input = true;
569 
570 		if (((if_getcapenable(priv->ifp) & IFCAP_LRO) != 0) &&      /* LRO is enabled */
571 		    (ctx->is_tcp) &&                      		    /* pkt is a TCP pkt */
572 		    ((mbuf->m_pkthdr.csum_flags & CSUM_DATA_VALID) != 0) && /* NIC verified csum */
573 		    (rx->lro.lro_cnt != 0) &&                               /* LRO resources exist */
574 		    (tcp_lro_rx(&rx->lro, mbuf, 0) == 0))
575 			do_if_input = false;
576 
577 		if (do_if_input)
578 			if_input(ifp, mbuf);
579 
580 		counter_enter();
581 		counter_u64_add_protected(rx->stats.rbytes, ctx->total_size);
582 		counter_u64_add_protected(rx->stats.rpackets, 1);
583 		counter_exit();
584 	}
585 
586 finish_frag:
587 	ctx->frag_cnt++;
588 	if (is_last_frag)
589 		rx->ctx = (struct gve_rx_ctx){};
590 }
591 
592 static bool
593 gve_rx_work_pending(struct gve_rx_ring *rx)
594 {
595 	struct gve_rx_desc *desc;
596 	__be16 flags_seq;
597 	uint32_t next_idx;
598 
599 	next_idx = rx->cnt & rx->mask;
600 	desc = rx->desc_ring + next_idx;
601 
602 	flags_seq = desc->flags_seq;
603 
604 	return (GVE_SEQNO(flags_seq) == rx->seq_no);
605 }
606 
/*
 * Advance a descriptor sequence number: the successor of 7 is 1, every
 * other value simply increments (truncated to uint8_t).
 */
static inline uint8_t
gve_next_seqno(uint8_t seq)
{
	uint8_t next = seq + 1;

	if (next == 8)
		next = 1;
	return (next);
}
612 
613 static void
614 gve_rx_cleanup(struct gve_priv *priv, struct gve_rx_ring *rx, int budget)
615 {
616 	uint32_t idx = rx->cnt & rx->mask;
617 	struct gve_rx_desc *desc;
618 	struct gve_rx_ctx *ctx = &rx->ctx;
619 	uint32_t work_done = 0;
620 
621 	NET_EPOCH_ASSERT();
622 
623 	bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map,
624 	    BUS_DMASYNC_POSTREAD);
625 	desc = &rx->desc_ring[idx];
626 
627 	while ((work_done < budget || ctx->frag_cnt) &&
628 	    (GVE_SEQNO(desc->flags_seq) == rx->seq_no)) {
629 
630 		gve_rx(priv, rx, desc, idx);
631 
632 		rx->cnt++;
633 		idx = rx->cnt & rx->mask;
634 		desc = &rx->desc_ring[idx];
635 		rx->seq_no = gve_next_seqno(rx->seq_no);
636 		work_done++;
637 	}
638 
639 	/* The device will only send whole packets. */
640 	if (__predict_false(ctx->frag_cnt)) {
641 		m_freem(ctx->mbuf_head);
642 		rx->ctx = (struct gve_rx_ctx){};
643 		device_printf(priv->dev,
644 		    "Unexpected seq number %d with incomplete packet, expected %d, scheduling reset",
645 		    GVE_SEQNO(desc->flags_seq), rx->seq_no);
646 		gve_schedule_reset(priv);
647 	}
648 
649 	if (work_done != 0)
650 		tcp_lro_flush_all(&rx->lro);
651 
652 	bus_dmamap_sync(rx->data_ring_mem.tag, rx->data_ring_mem.map,
653 	    BUS_DMASYNC_PREWRITE);
654 
655 	/* Buffers are refilled as the descs are processed */
656 	rx->fill_cnt += work_done;
657 	gve_db_bar_write_4(priv, rx->com.db_offset, rx->fill_cnt);
658 }
659 
660 void
661 gve_rx_cleanup_tq(void *arg, int pending)
662 {
663 	struct gve_rx_ring *rx = arg;
664 	struct gve_priv *priv = rx->com.priv;
665 
666 	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
667 		return;
668 
669 	gve_rx_cleanup(priv, rx, /*budget=*/128);
670 
671 	gve_db_bar_write_4(priv, rx->com.irq_db_offset,
672 	    GVE_IRQ_ACK | GVE_IRQ_EVENT);
673 
674 	/*
675 	 * Fragments received before this barrier MAY NOT cause the NIC to send an
676 	 * interrupt but they will still be handled by the enqueue below.
677 	 * Fragments received after the barrier WILL trigger an interrupt.
678 	 */
679 	mb();
680 
681 	if (gve_rx_work_pending(rx)) {
682 		gve_db_bar_write_4(priv, rx->com.irq_db_offset, GVE_IRQ_MASK);
683 		taskqueue_enqueue(rx->com.cleanup_tq, &rx->com.cleanup_task);
684 	}
685 }
686