xref: /freebsd/sys/dev/gve/gve_rx.c (revision 5036d9652a5701d00e9e40ea942c278e9f77d33d)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 2023-2024 Google LLC
5  *
6  * Redistribution and use in source and binary forms, with or without modification,
7  * are permitted provided that the following conditions are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright notice, this
10  *    list of conditions and the following disclaimer.
11  *
12  * 2. Redistributions in binary form must reproduce the above copyright notice,
13  *    this list of conditions and the following disclaimer in the documentation
14  *    and/or other materials provided with the distribution.
15  *
16  * 3. Neither the name of the copyright holder nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software without
18  *    specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
22  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
24  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
25  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
26  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
27  * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
29  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  */
31 #include "gve.h"
32 #include "gve_adminq.h"
33 #include "gve_dqo.h"
34 
35 static void
36 gve_rx_free_ring_gqi(struct gve_priv *priv, int i)
37 {
38 	struct gve_rx_ring *rx = &priv->rx[i];
39 
40 	if (rx->page_info != NULL) {
41 		free(rx->page_info, M_GVE);
42 		rx->page_info = NULL;
43 	}
44 
45 	if (rx->data_ring != NULL) {
46 		gve_dma_free_coherent(&rx->data_ring_mem);
47 		rx->data_ring = NULL;
48 	}
49 
50 	if (rx->desc_ring != NULL) {
51 		gve_dma_free_coherent(&rx->desc_ring_mem);
52 		rx->desc_ring = NULL;
53 	}
54 }
55 
56 static void
57 gve_rx_free_ring(struct gve_priv *priv, int i)
58 {
59 	struct gve_rx_ring *rx = &priv->rx[i];
60 	struct gve_ring_com *com = &rx->com;
61 
62         /* Safe to call even if never allocated */
63 	gve_free_counters((counter_u64_t *)&rx->stats, NUM_RX_STATS);
64 
65 	if (gve_is_gqi(priv))
66 		gve_rx_free_ring_gqi(priv, i);
67 	else
68 		gve_rx_free_ring_dqo(priv, i);
69 
70 	if (com->q_resources != NULL) {
71 		gve_dma_free_coherent(&com->q_resources_mem);
72 		com->q_resources = NULL;
73 	}
74 }
75 
76 static void
77 gve_prefill_rx_slots(struct gve_rx_ring *rx)
78 {
79 	struct gve_ring_com *com = &rx->com;
80 	struct gve_dma_handle *dma;
81 	int i;
82 
83 	for (i = 0; i < com->priv->rx_desc_cnt; i++) {
84 		rx->data_ring[i].qpl_offset = htobe64(PAGE_SIZE * i);
85 		rx->page_info[i].page_offset = 0;
86 		rx->page_info[i].page_address = com->qpl->dmas[i].cpu_addr;
87 		rx->page_info[i].page = com->qpl->pages[i];
88 
89 		dma = &com->qpl->dmas[i];
90 		bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREREAD);
91 	}
92 
93 	bus_dmamap_sync(rx->data_ring_mem.tag, rx->data_ring_mem.map,
94 	    BUS_DMASYNC_PREWRITE);
95 }
96 
97 static int
98 gve_rx_alloc_ring_gqi(struct gve_priv *priv, int i)
99 {
100 	struct gve_rx_ring *rx = &priv->rx[i];
101 	struct gve_ring_com *com = &rx->com;
102 	int err;
103 
104 	err = gve_dma_alloc_coherent(priv,
105 	    sizeof(struct gve_rx_desc) * priv->rx_desc_cnt,
106 	    CACHE_LINE_SIZE, &rx->desc_ring_mem);
107 	if (err != 0) {
108 		device_printf(priv->dev,
109 		    "Failed to alloc desc ring for rx ring %d", i);
110 		goto abort;
111 	}
112 
113 	rx->mask = priv->rx_pages_per_qpl - 1;
114 	rx->desc_ring = rx->desc_ring_mem.cpu_addr;
115 
116 	com->qpl = &priv->qpls[priv->tx_cfg.max_queues + i];
117 	if (com->qpl == NULL) {
118 		device_printf(priv->dev, "No QPL left for rx ring %d", i);
119 		return (ENOMEM);
120 	}
121 
122 	rx->page_info = malloc(priv->rx_desc_cnt * sizeof(*rx->page_info),
123 	    M_GVE, M_WAITOK | M_ZERO);
124 
125 	err = gve_dma_alloc_coherent(priv,
126 	    sizeof(union gve_rx_data_slot) * priv->rx_desc_cnt,
127 	    CACHE_LINE_SIZE, &rx->data_ring_mem);
128 	if (err != 0) {
129 		device_printf(priv->dev,
130 		    "Failed to alloc data ring for rx ring %d", i);
131 		goto abort;
132 	}
133 	rx->data_ring = rx->data_ring_mem.cpu_addr;
134 
135 	gve_prefill_rx_slots(rx);
136 	return (0);
137 
138 abort:
139 	gve_rx_free_ring_gqi(priv, i);
140 	return (err);
141 }
142 
143 static int
144 gve_rx_alloc_ring(struct gve_priv *priv, int i)
145 {
146 	struct gve_rx_ring *rx = &priv->rx[i];
147 	struct gve_ring_com *com = &rx->com;
148 	int err;
149 
150 	com->priv = priv;
151 	com->id = i;
152 
153 	gve_alloc_counters((counter_u64_t *)&rx->stats, NUM_RX_STATS);
154 
155 	err = gve_dma_alloc_coherent(priv, sizeof(struct gve_queue_resources),
156 	    PAGE_SIZE, &com->q_resources_mem);
157 	if (err != 0) {
158 		device_printf(priv->dev,
159 		    "Failed to alloc queue resources for rx ring %d", i);
160 		goto abort;
161 	}
162 	com->q_resources = com->q_resources_mem.cpu_addr;
163 
164 	if (gve_is_gqi(priv))
165 		err = gve_rx_alloc_ring_gqi(priv, i);
166 	else
167 		err = gve_rx_alloc_ring_dqo(priv, i);
168 	if (err != 0)
169 		goto abort;
170 
171 	return (0);
172 
173 abort:
174 	gve_rx_free_ring(priv, i);
175 	return (err);
176 }
177 
178 int
179 gve_alloc_rx_rings(struct gve_priv *priv)
180 {
181 	int err = 0;
182 	int i;
183 
184 	priv->rx = malloc(sizeof(struct gve_rx_ring) * priv->rx_cfg.num_queues,
185 	    M_GVE, M_WAITOK | M_ZERO);
186 
187 	for (i = 0; i < priv->rx_cfg.num_queues; i++) {
188 		err = gve_rx_alloc_ring(priv, i);
189 		if (err != 0)
190 			goto free_rings;
191 	}
192 
193 	return (0);
194 
195 free_rings:
196 	while (i--)
197 		gve_rx_free_ring(priv, i);
198 	free(priv->rx, M_GVE);
199 	return (err);
200 }
201 
202 void
203 gve_free_rx_rings(struct gve_priv *priv)
204 {
205 	int i;
206 
207 	for (i = 0; i < priv->rx_cfg.num_queues; i++)
208 		gve_rx_free_ring(priv, i);
209 
210 	free(priv->rx, M_GVE);
211 }
212 
213 static void
214 gve_rx_clear_data_ring(struct gve_rx_ring *rx)
215 {
216 	struct gve_priv *priv = rx->com.priv;
217 	int i;
218 
219 	/*
220 	 * The Rx data ring has this invariant: "the networking stack is not
221 	 * using the buffer beginning at any page_offset". This invariant is
222 	 * established initially by gve_prefill_rx_slots at alloc-time and is
223 	 * maintained by the cleanup taskqueue. This invariant implies that the
224 	 * ring can be considered to be fully posted with buffers at this point,
225 	 * even if there are unfreed mbufs still being processed, which is why we
226 	 * can fill the ring without waiting on can_flip at each slot to become true.
227 	 */
228 	for (i = 0; i < priv->rx_desc_cnt; i++) {
229 		rx->data_ring[i].qpl_offset = htobe64(PAGE_SIZE * i +
230 		    rx->page_info[i].page_offset);
231 		rx->fill_cnt++;
232 	}
233 
234 	bus_dmamap_sync(rx->data_ring_mem.tag, rx->data_ring_mem.map,
235 	    BUS_DMASYNC_PREWRITE);
236 }
237 
238 static void
239 gve_rx_clear_desc_ring(struct gve_rx_ring *rx)
240 {
241 	struct gve_priv *priv = rx->com.priv;
242 	int i;
243 
244 	for (i = 0; i < priv->rx_desc_cnt; i++)
245 		rx->desc_ring[i] = (struct gve_rx_desc){};
246 
247 	bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map,
248 	    BUS_DMASYNC_PREWRITE);
249 }
250 
251 static void
252 gve_clear_rx_ring(struct gve_priv *priv, int i)
253 {
254 	struct gve_rx_ring *rx = &priv->rx[i];
255 
256 	if (!gve_is_gqi(priv)) {
257 		gve_clear_rx_ring_dqo(priv, i);
258 		return;
259 	}
260 
261 	rx->seq_no = 1;
262 	rx->cnt = 0;
263 	rx->fill_cnt = 0;
264 	rx->mask = priv->rx_desc_cnt - 1;
265 
266 	gve_rx_clear_desc_ring(rx);
267 	gve_rx_clear_data_ring(rx);
268 }
269 
270 static void
271 gve_start_rx_ring(struct gve_priv *priv, int i)
272 {
273 	struct gve_rx_ring *rx = &priv->rx[i];
274 	struct gve_ring_com *com = &rx->com;
275 
276 	if ((if_getcapenable(priv->ifp) & IFCAP_LRO) != 0) {
277 		if (tcp_lro_init(&rx->lro) != 0)
278 			device_printf(priv->dev, "Failed to init lro for rx ring %d", i);
279 		rx->lro.ifp = priv->ifp;
280 	}
281 
282 	if (gve_is_gqi(priv))
283 		NET_TASK_INIT(&com->cleanup_task, 0, gve_rx_cleanup_tq, rx);
284 	else
285 		NET_TASK_INIT(&com->cleanup_task, 0, gve_rx_cleanup_tq_dqo, rx);
286 	com->cleanup_tq = taskqueue_create_fast("gve rx", M_WAITOK,
287 	    taskqueue_thread_enqueue, &com->cleanup_tq);
288 
289 	taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET,
290 	    "%s rxq %d", device_get_nameunit(priv->dev), i);
291 
292 	if (gve_is_gqi(priv)) {
293 		/* GQ RX bufs are prefilled at ring alloc time */
294 		gve_db_bar_write_4(priv, com->db_offset, rx->fill_cnt);
295 	} else
296 		gve_rx_prefill_buffers_dqo(rx);
297 }
298 
299 int
300 gve_create_rx_rings(struct gve_priv *priv)
301 {
302 	struct gve_ring_com *com;
303 	struct gve_rx_ring *rx;
304 	int err;
305 	int i;
306 
307 	if (gve_get_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK))
308 		return (0);
309 
310 	for (i = 0; i < priv->rx_cfg.num_queues; i++)
311 		gve_clear_rx_ring(priv, i);
312 
313 	err = gve_adminq_create_rx_queues(priv, priv->rx_cfg.num_queues);
314 	if (err != 0)
315 		return (err);
316 
317 	bus_dmamap_sync(priv->irqs_db_mem.tag, priv->irqs_db_mem.map,
318 	    BUS_DMASYNC_POSTREAD);
319 
320 	for (i = 0; i < priv->rx_cfg.num_queues; i++) {
321 		rx = &priv->rx[i];
322 		com = &rx->com;
323 
324 		com->irq_db_offset = 4 * be32toh(priv->irq_db_indices[com->ntfy_id].index);
325 
326 		bus_dmamap_sync(com->q_resources_mem.tag, com->q_resources_mem.map,
327 		    BUS_DMASYNC_POSTREAD);
328 		com->db_offset = 4 * be32toh(com->q_resources->db_index);
329 		com->counter_idx = be32toh(com->q_resources->counter_index);
330 
331 		gve_start_rx_ring(priv, i);
332 	}
333 
334 	gve_set_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK);
335 	return (0);
336 }
337 
338 static void
339 gve_stop_rx_ring(struct gve_priv *priv, int i)
340 {
341 	struct gve_rx_ring *rx = &priv->rx[i];
342 	struct gve_ring_com *com = &rx->com;
343 
344 	if (com->cleanup_tq != NULL) {
345 		taskqueue_quiesce(com->cleanup_tq);
346 		taskqueue_free(com->cleanup_tq);
347 		com->cleanup_tq = NULL;
348 	}
349 
350 	tcp_lro_free(&rx->lro);
351 	rx->ctx = (struct gve_rx_ctx){};
352 }
353 
354 int
355 gve_destroy_rx_rings(struct gve_priv *priv)
356 {
357 	int err;
358 	int i;
359 
360 	for (i = 0; i < priv->rx_cfg.num_queues; i++)
361 		gve_stop_rx_ring(priv, i);
362 
363 	if (gve_get_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK)) {
364 		err = gve_adminq_destroy_rx_queues(priv, priv->rx_cfg.num_queues);
365 		if (err != 0)
366 			return (err);
367 		gve_clear_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK);
368 	}
369 
370 	return (0);
371 }
372 
373 int
374 gve_rx_intr(void *arg)
375 {
376 	struct gve_rx_ring *rx = arg;
377 	struct gve_priv *priv = rx->com.priv;
378 	struct gve_ring_com *com = &rx->com;
379 
380 	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
381 		return (FILTER_STRAY);
382 
383 	gve_db_bar_write_4(priv, com->irq_db_offset, GVE_IRQ_MASK);
384 	taskqueue_enqueue(rx->com.cleanup_tq, &rx->com.cleanup_task);
385 	return (FILTER_HANDLED);
386 }
387 
388 static inline void
389 gve_set_rss_type(__be16 flag, struct mbuf *mbuf)
390 {
391 	if ((flag & GVE_RXF_IPV4) != 0) {
392 		if ((flag & GVE_RXF_TCP) != 0)
393 			M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV4);
394 		else if ((flag & GVE_RXF_UDP) != 0)
395 			M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV4);
396 		else
397 			M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV4);
398 		return;
399 	}
400 
401 	if ((flag & GVE_RXF_IPV6) != 0) {
402 		if ((flag & GVE_RXF_TCP) != 0)
403 			M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV6);
404 		else if ((flag & GVE_RXF_UDP) != 0)
405 			M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV6);
406 		else
407 			M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV6);
408 		return;
409 	}
410 }
411 
412 static void
413 gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, __be64 *slot_addr)
414 {
415 	const __be64 offset = htobe64(GVE_DEFAULT_RX_BUFFER_OFFSET);
416 	page_info->page_offset ^= GVE_DEFAULT_RX_BUFFER_OFFSET;
417 	*(slot_addr) ^= offset;
418 }
419 
420 static struct mbuf *
421 gve_rx_create_mbuf(struct gve_priv *priv, struct gve_rx_ring *rx,
422     struct gve_rx_slot_page_info *page_info, uint16_t len,
423     union gve_rx_data_slot *data_slot, bool is_only_frag)
424 {
425 	struct gve_rx_ctx *ctx = &rx->ctx;
426 	struct mbuf *mbuf;
427 	u_int ref_count;
428 	bool can_flip;
429 
430 	uint32_t offset = page_info->page_offset + page_info->pad;
431 	void *va = (char *)page_info->page_address + offset;
432 
433 	if (len <= priv->rx_copybreak && is_only_frag) {
434 		mbuf = m_get2(len, M_NOWAIT, MT_DATA, M_PKTHDR);
435 		if (__predict_false(mbuf == NULL))
436 			return (NULL);
437 
438 		m_copyback(mbuf, 0, len, va);
439 		counter_enter();
440 		counter_u64_add_protected(rx->stats.rx_copybreak_cnt, 1);
441 		counter_exit();
442 		ctx->mbuf_head = mbuf;
443 		ctx->mbuf_tail = mbuf;
444 	} else {
445 		struct mbuf *mbuf_tail = ctx->mbuf_tail;
446 		KASSERT(len <= MCLBYTES, ("gve rx fragment bigger than cluster mbuf"));
447 
448 		/*
449 		 * This page was created with VM_ALLOC_WIRED, thus the lowest
450 		 * wire count experienced by the page until the interface is
451 		 * destroyed is 1.
452 		 *
453 		 * We wire the page again before supplying an mbuf pointing to
454 		 * it to the networking stack, so before the mbuf leaves the
455 		 * driver, the wire count rises to 2.
456 		 *
457 		 * If it is 1 again, it necessarily means that the mbuf has been
458 		 * consumed and it was gve_mextadd_free that brought down the wire
459 		 * count back to 1. We only need to eventually observe the 1.
460 		 */
461 		ref_count = atomic_load_int(&page_info->page->ref_count);
462 		can_flip = VPRC_WIRE_COUNT(ref_count) == 1;
463 
464 		if (mbuf_tail == NULL) {
465 			if (can_flip)
466 				mbuf = m_gethdr(M_NOWAIT, MT_DATA);
467 			else
468 				mbuf = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
469 
470 			ctx->mbuf_head = mbuf;
471 			ctx->mbuf_tail = mbuf;
472 		} else {
473 			if (can_flip)
474 				mbuf = m_get(M_NOWAIT, MT_DATA);
475 			else
476 				mbuf = m_getcl(M_NOWAIT, MT_DATA, 0);
477 
478 			mbuf_tail->m_next = mbuf;
479 			ctx->mbuf_tail = mbuf;
480 		}
481 
482 		if (__predict_false(mbuf == NULL))
483 			return (NULL);
484 
485 		if (can_flip) {
486 			MEXTADD(mbuf, va, len, gve_mextadd_free,
487 			    page_info->page, page_info->page_address,
488 			    0, EXT_NET_DRV);
489 
490 			counter_enter();
491 			counter_u64_add_protected(rx->stats.rx_frag_flip_cnt, 1);
492 			counter_exit();
493 
494 			/*
495 			 * Grab an extra ref to the page so that gve_mextadd_free
496 			 * does not end up freeing the page while the interface exists.
497 			 */
498 			vm_page_wire(page_info->page);
499 
500 			gve_rx_flip_buff(page_info, &data_slot->qpl_offset);
501 		} else {
502 			m_copyback(mbuf, 0, len, va);
503 			counter_enter();
504 			counter_u64_add_protected(rx->stats.rx_frag_copy_cnt, 1);
505 			counter_exit();
506 		}
507 	}
508 
509 	mbuf->m_len = len;
510 	ctx->total_size += len;
511 
512 	return (mbuf);
513 }
514 
515 static inline bool
516 gve_needs_rss(__be16 flag)
517 {
518 	if ((flag & GVE_RXF_FRAG) != 0)
519 		return (false);
520 	if ((flag & (GVE_RXF_IPV4 | GVE_RXF_IPV6)) != 0)
521 		return (true);
522 	return (false);
523 }
524 
525 static void
526 gve_rx(struct gve_priv *priv, struct gve_rx_ring *rx, struct gve_rx_desc *desc,
527     uint32_t idx)
528 {
529 	struct gve_rx_slot_page_info *page_info;
530 	struct gve_dma_handle *page_dma_handle;
531 	union gve_rx_data_slot *data_slot;
532 	struct gve_rx_ctx *ctx = &rx->ctx;
533 	struct mbuf *mbuf = NULL;
534 	if_t ifp = priv->ifp;
535 	bool do_if_input;
536 	uint16_t len;
537 
538 	bool is_first_frag = ctx->frag_cnt == 0;
539 	bool is_last_frag = !(GVE_RXF_PKT_CONT & desc->flags_seq);
540 	bool is_only_frag = is_first_frag && is_last_frag;
541 
542 	if (__predict_false(ctx->drop_pkt))
543 		goto finish_frag;
544 
545 	if ((desc->flags_seq & GVE_RXF_ERR) != 0) {
546 		ctx->drop_pkt = true;
547 		counter_enter();
548 		counter_u64_add_protected(rx->stats.rx_dropped_pkt_desc_err, 1);
549 		counter_u64_add_protected(rx->stats.rx_dropped_pkt, 1);
550 		counter_exit();
551 		m_freem(ctx->mbuf_head);
552 		goto finish_frag;
553 	}
554 
555 	page_info = &rx->page_info[idx];
556 	data_slot = &rx->data_ring[idx];
557 	page_dma_handle = &(rx->com.qpl->dmas[idx]);
558 
559 	page_info->pad = is_first_frag ? GVE_RX_PAD : 0;
560 	len = be16toh(desc->len) - page_info->pad;
561 
562 	bus_dmamap_sync(page_dma_handle->tag, page_dma_handle->map,
563 	    BUS_DMASYNC_POSTREAD);
564 
565 	mbuf = gve_rx_create_mbuf(priv, rx, page_info, len, data_slot,
566 	    is_only_frag);
567 	if (mbuf == NULL) {
568 		ctx->drop_pkt = true;
569 		counter_enter();
570 		counter_u64_add_protected(rx->stats.rx_dropped_pkt_mbuf_alloc_fail, 1);
571 		counter_u64_add_protected(rx->stats.rx_dropped_pkt, 1);
572 		counter_exit();
573 		m_freem(ctx->mbuf_head);
574 		goto finish_frag;
575 	}
576 
577 	if (is_first_frag) {
578 		mbuf->m_pkthdr.rcvif = priv->ifp;
579 		ctx->is_tcp = desc->flags_seq & GVE_RXF_TCP;
580 
581 		if (gve_needs_rss(desc->flags_seq)) {
582 			gve_set_rss_type(desc->flags_seq, mbuf);
583 			mbuf->m_pkthdr.flowid = be32toh(desc->rss_hash);
584 		}
585 
586 		if ((desc->csum != 0) && ((desc->flags_seq & GVE_RXF_FRAG) == 0)) {
587 			mbuf->m_pkthdr.csum_flags = CSUM_IP_CHECKED |
588 				                    CSUM_IP_VALID |
589 						    CSUM_DATA_VALID |
590 						    CSUM_PSEUDO_HDR;
591 			mbuf->m_pkthdr.csum_data = 0xffff;
592 		}
593 	}
594 
595 	if (is_last_frag) {
596 		mbuf = ctx->mbuf_head;
597 		mbuf->m_pkthdr.len = ctx->total_size;
598 		do_if_input = true;
599 
600 		if (((if_getcapenable(priv->ifp) & IFCAP_LRO) != 0) &&      /* LRO is enabled */
601 		    (ctx->is_tcp) &&                      		    /* pkt is a TCP pkt */
602 		    ((mbuf->m_pkthdr.csum_flags & CSUM_DATA_VALID) != 0) && /* NIC verified csum */
603 		    (rx->lro.lro_cnt != 0) &&                               /* LRO resources exist */
604 		    (tcp_lro_rx(&rx->lro, mbuf, 0) == 0))
605 			do_if_input = false;
606 
607 		if (do_if_input)
608 			if_input(ifp, mbuf);
609 
610 		counter_enter();
611 		counter_u64_add_protected(rx->stats.rbytes, ctx->total_size);
612 		counter_u64_add_protected(rx->stats.rpackets, 1);
613 		counter_exit();
614 	}
615 
616 finish_frag:
617 	ctx->frag_cnt++;
618 	if (is_last_frag)
619 		rx->ctx = (struct gve_rx_ctx){};
620 }
621 
622 static bool
623 gve_rx_work_pending(struct gve_rx_ring *rx)
624 {
625 	struct gve_rx_desc *desc;
626 	__be16 flags_seq;
627 	uint32_t next_idx;
628 
629 	next_idx = rx->cnt & rx->mask;
630 	desc = rx->desc_ring + next_idx;
631 
632 	flags_seq = desc->flags_seq;
633 
634 	return (GVE_SEQNO(flags_seq) == rx->seq_no);
635 }
636 
/*
 * Return the sequence number that follows seq: 7 is followed by 1, and any
 * other value by seq + 1 (truncated to uint8_t).
 */
static inline uint8_t
gve_next_seqno(uint8_t seq)
{
	if (seq == 7)
		return (1);

	return (seq + 1);
}
642 
643 static void
644 gve_rx_cleanup(struct gve_priv *priv, struct gve_rx_ring *rx, int budget)
645 {
646 	uint32_t idx = rx->cnt & rx->mask;
647 	struct gve_rx_desc *desc;
648 	struct gve_rx_ctx *ctx = &rx->ctx;
649 	uint32_t work_done = 0;
650 
651 	NET_EPOCH_ASSERT();
652 
653 	bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map,
654 	    BUS_DMASYNC_POSTREAD);
655 	desc = &rx->desc_ring[idx];
656 
657 	while ((work_done < budget || ctx->frag_cnt) &&
658 	    (GVE_SEQNO(desc->flags_seq) == rx->seq_no)) {
659 
660 		gve_rx(priv, rx, desc, idx);
661 
662 		rx->cnt++;
663 		idx = rx->cnt & rx->mask;
664 		desc = &rx->desc_ring[idx];
665 		rx->seq_no = gve_next_seqno(rx->seq_no);
666 		work_done++;
667 	}
668 
669 	/* The device will only send whole packets. */
670 	if (__predict_false(ctx->frag_cnt)) {
671 		m_freem(ctx->mbuf_head);
672 		rx->ctx = (struct gve_rx_ctx){};
673 		device_printf(priv->dev,
674 		    "Unexpected seq number %d with incomplete packet, expected %d, scheduling reset",
675 		    GVE_SEQNO(desc->flags_seq), rx->seq_no);
676 		gve_schedule_reset(priv);
677 	}
678 
679 	if (work_done != 0)
680 		tcp_lro_flush_all(&rx->lro);
681 
682 	bus_dmamap_sync(rx->data_ring_mem.tag, rx->data_ring_mem.map,
683 	    BUS_DMASYNC_PREWRITE);
684 
685 	/* Buffers are refilled as the descs are processed */
686 	rx->fill_cnt += work_done;
687 	gve_db_bar_write_4(priv, rx->com.db_offset, rx->fill_cnt);
688 }
689 
690 void
691 gve_rx_cleanup_tq(void *arg, int pending)
692 {
693 	struct gve_rx_ring *rx = arg;
694 	struct gve_priv *priv = rx->com.priv;
695 
696 	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
697 		return;
698 
699 	gve_rx_cleanup(priv, rx, /*budget=*/128);
700 
701 	gve_db_bar_write_4(priv, rx->com.irq_db_offset,
702 	    GVE_IRQ_ACK | GVE_IRQ_EVENT);
703 
704 	/*
705 	 * Fragments received before this barrier MAY NOT cause the NIC to send an
706 	 * interrupt but they will still be handled by the enqueue below.
707 	 * Fragments received after the barrier WILL trigger an interrupt.
708 	 */
709 	mb();
710 
711 	if (gve_rx_work_pending(rx)) {
712 		gve_db_bar_write_4(priv, rx->com.irq_db_offset, GVE_IRQ_MASK);
713 		taskqueue_enqueue(rx->com.cleanup_tq, &rx->com.cleanup_task);
714 	}
715 }
716