/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 2024 Google LLC
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 *    may be used to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include "gve.h"
#include "gve_adminq.h"
#include "gve_dqo.h"

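/*
 * Free any mbufs still attached to the ring's buffers and unload their
 * DMA maps. Only the non-QPL (mbuf) mode attaches mbufs to buffers, so
 * this is a no-op for QPL rings.
 */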
static void
gve_free_rx_mbufs_dqo(struct gve_rx_ring *rx)
{
	struct gve_rx_buf_dqo *buf;
	int i;

	if (gve_is_qpl(rx->com.priv))
		return;

	for (i = 0; i < rx->dqo.buf_cnt; i++) {
		buf = &rx->dqo.bufs[i];
		if (!buf->mbuf)
			continue;

		bus_dmamap_sync(rx->dqo.buf_dmatag, buf->dmamap,
		    BUS_DMASYNC_POSTREAD);
		bus_dmamap_unload(rx->dqo.buf_dmatag, buf->dmamap);
		m_freem(buf->mbuf);
		buf->mbuf = NULL;
	}
}

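/*
 * Tear down everything allocated for rx ring i: the completion and
 * descriptor rings, the buffer array along with its DMA maps and tag
 * (non-QPL mode), and the queue page list (QPL mode).
 */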
void
gve_rx_free_ring_dqo(struct gve_priv *priv, int i)
{
	struct gve_rx_ring *rx = &priv->rx[i];
	struct gve_ring_com *com = &rx->com;
	int j;

	if (rx->dqo.compl_ring != NULL) {
		gve_dma_free_coherent(&rx->dqo.compl_ring_mem);
		rx->dqo.compl_ring = NULL;
	}

	if (rx->dqo.desc_ring != NULL) {
		gve_dma_free_coherent(&rx->desc_ring_mem);
		rx->dqo.desc_ring = NULL;
	}

	if (rx->dqo.bufs != NULL) {
		gve_free_rx_mbufs_dqo(rx);

		if (!gve_is_qpl(priv) && rx->dqo.buf_dmatag) {
			for (j = 0; j < rx->dqo.buf_cnt; j++)
				if (rx->dqo.bufs[j].mapped)
					bus_dmamap_destroy(rx->dqo.buf_dmatag,
					    rx->dqo.bufs[j].dmamap);
		}

		free(rx->dqo.bufs, M_GVE);
		rx->dqo.bufs = NULL;
	}

	if (!gve_is_qpl(priv) && rx->dqo.buf_dmatag)
		bus_dma_tag_destroy(rx->dqo.buf_dmatag);

	if (com->qpl != NULL) {
		gve_free_qpl(priv, com->qpl);
		com->qpl = NULL;
	}
}

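/*
 * Allocate the descriptor and completion rings for rx ring i, plus
 * either a queue page list (QPL mode) or a per-buffer DMA tag and
 * maps (non-QPL mode). On any failure, gve_rx_free_ring_dqo() undoes
 * whatever was allocated so far.
 */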
int
gve_rx_alloc_ring_dqo(struct gve_priv *priv, int i)
{
	struct gve_rx_ring *rx = &priv->rx[i];
	int err;
	int j;

	err = gve_dma_alloc_coherent(priv,
	    sizeof(struct gve_rx_desc_dqo) * priv->rx_desc_cnt,
	    CACHE_LINE_SIZE, &rx->desc_ring_mem);
	if (err != 0) {
		device_printf(priv->dev,
		    "Failed to alloc desc ring for rx ring %d", i);
		goto abort;
	}
	rx->dqo.desc_ring = rx->desc_ring_mem.cpu_addr;
	rx->dqo.mask = priv->rx_desc_cnt - 1;

	err = gve_dma_alloc_coherent(priv,
	    sizeof(struct gve_rx_compl_desc_dqo) * priv->rx_desc_cnt,
	    CACHE_LINE_SIZE, &rx->dqo.compl_ring_mem);
	if (err != 0) {
		device_printf(priv->dev,
		    "Failed to alloc compl ring for rx ring %d", i);
		goto abort;
	}
	rx->dqo.compl_ring = rx->dqo.compl_ring_mem.cpu_addr;
	rx->dqo.mask = priv->rx_desc_cnt - 1;

	rx->dqo.buf_cnt = gve_is_qpl(priv) ? GVE_RX_NUM_QPL_PAGES_DQO :
	    priv->rx_desc_cnt;
	rx->dqo.bufs = malloc(rx->dqo.buf_cnt * sizeof(struct gve_rx_buf_dqo),
	    M_GVE, M_WAITOK | M_ZERO);

	if (gve_is_qpl(priv)) {
		rx->com.qpl = gve_alloc_qpl(priv, i + priv->tx_cfg.max_queues,
		    GVE_RX_NUM_QPL_PAGES_DQO, /*single_kva=*/false);
		if (rx->com.qpl == NULL) {
			device_printf(priv->dev,
			    "Failed to alloc QPL for rx ring %d", i);
			err = ENOMEM;
			goto abort;
		}
		return (0);
	}

	err = bus_dma_tag_create(
	    bus_get_dma_tag(priv->dev),	/* parent */
	    1, 0,			/* alignment, bounds */
	    BUS_SPACE_MAXADDR,		/* lowaddr */
	    BUS_SPACE_MAXADDR,		/* highaddr */
	    NULL, NULL,			/* filter, filterarg */
	    MCLBYTES,			/* maxsize */
	    1,				/* nsegments */
	    MCLBYTES,			/* maxsegsize */
	    0,				/* flags */
	    NULL,			/* lockfunc */
	    NULL,			/* lockarg */
	    &rx->dqo.buf_dmatag);
	if (err != 0) {
		device_printf(priv->dev,
		    "%s: bus_dma_tag_create failed: %d\n",
		    __func__, err);
		goto abort;
	}

	for (j = 0; j < rx->dqo.buf_cnt; j++) {
		err = bus_dmamap_create(rx->dqo.buf_dmatag, 0,
		    &rx->dqo.bufs[j].dmamap);
		if (err != 0) {
			device_printf(priv->dev,
			    "err in creating rx buf dmamap %d: %d",
			    j, err);
			goto abort;
		}
		rx->dqo.bufs[j].mapped = true;
	}

	return (0);

abort:
	gve_rx_free_ring_dqo(priv, i);
	return (err);
}

static void
gve_rx_clear_desc_ring_dqo(struct gve_rx_ring *rx)
{
	struct gve_ring_com *com = &rx->com;
	int entries;
	int i;

	entries = com->priv->rx_desc_cnt;
	for (i = 0; i < entries; i++)
		rx->dqo.desc_ring[i] = (struct gve_rx_desc_dqo){};

	bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map,
	    BUS_DMASYNC_PREWRITE);
}

static void
gve_rx_clear_compl_ring_dqo(struct gve_rx_ring *rx)
{
	struct gve_ring_com *com = &rx->com;
	int i;

	for (i = 0; i < com->priv->rx_desc_cnt; i++)
		rx->dqo.compl_ring[i] = (struct gve_rx_compl_desc_dqo){};

	bus_dmamap_sync(rx->dqo.compl_ring_mem.tag, rx->dqo.compl_ring_mem.map,
	    BUS_DMASYNC_PREWRITE);
}

void
gve_clear_rx_ring_dqo(struct gve_priv *priv, int i)
{
	struct gve_rx_ring *rx = &priv->rx[i];
	int j;

	rx->fill_cnt = 0;
	rx->cnt = 0;
	rx->dqo.mask = priv->rx_desc_cnt - 1;
	rx->dqo.head = 0;
	rx->dqo.tail = 0;
	rx->dqo.cur_gen_bit = 0;

	gve_rx_clear_desc_ring_dqo(rx);
	gve_rx_clear_compl_ring_dqo(rx);

	gve_free_rx_mbufs_dqo(rx);

	if (gve_is_qpl(priv)) {
		SLIST_INIT(&rx->dqo.free_bufs);
		STAILQ_INIT(&rx->dqo.used_bufs);

		for (j = 0; j < rx->dqo.buf_cnt; j++) {
			struct gve_rx_buf_dqo *buf = &rx->dqo.bufs[j];

			vm_page_t page = rx->com.qpl->pages[buf - rx->dqo.bufs];
			u_int ref_count = atomic_load_int(&page->ref_count);

			/*
			 * An ifconfig down+up might find pages from the
			 * queue's previous run still in flight up the stack.
			 */
			if (VPRC_WIRE_COUNT(ref_count) == 1)
				SLIST_INSERT_HEAD(&rx->dqo.free_bufs,
				    buf, slist_entry);
			else
				STAILQ_INSERT_TAIL(&rx->dqo.used_bufs,
				    buf, stailq_entry);

			buf->num_nic_frags = 0;
			buf->next_idx = 0;
		}
	} else {
		SLIST_INIT(&rx->dqo.free_bufs);
		for (j = 0; j < rx->dqo.buf_cnt; j++)
			SLIST_INSERT_HEAD(&rx->dqo.free_bufs,
			    &rx->dqo.bufs[j], slist_entry);
	}
}

int
gve_rx_intr_dqo(void *arg)
{
	struct gve_rx_ring *rx = arg;
	struct gve_priv *priv = rx->com.priv;
	struct gve_ring_com *com = &rx->com;

	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
		return (FILTER_STRAY);

	/* Interrupts are automatically masked */
	taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task);
	return (FILTER_HANDLED);
}

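/*
 * Advance the buffer-posting head. The doorbell is only written once
 * every GVE_RX_BUF_THRESH_DQO buffers so descriptors are handed to the
 * NIC in batches instead of one at a time.
 */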
static void
gve_rx_advance_head_dqo(struct gve_rx_ring *rx)
{
	rx->dqo.head = (rx->dqo.head + 1) & rx->dqo.mask;
	rx->fill_cnt++; /* rx->fill_cnt is just a sysctl counter */

	if ((rx->dqo.head & (GVE_RX_BUF_THRESH_DQO - 1)) == 0) {
		bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map,
		    BUS_DMASYNC_PREWRITE);
		gve_db_bar_dqo_write_4(rx->com.priv, rx->com.db_offset,
		    rx->dqo.head);
	}
}

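/*
 * Non-QPL mode: write the buffer's id and DMA address into the
 * descriptor at the current head and advance the head.
 */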
static void
gve_rx_post_buf_dqo(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf)
{
	struct gve_rx_desc_dqo *desc;

	bus_dmamap_sync(rx->dqo.buf_dmatag, buf->dmamap,
	    BUS_DMASYNC_PREREAD);

	desc = &rx->dqo.desc_ring[rx->dqo.head];
	desc->buf_id = htole16(buf - rx->dqo.bufs);
	desc->buf_addr = htole64(buf->addr);

	gve_rx_advance_head_dqo(rx);
}

static int
gve_rx_post_new_mbuf_dqo(struct gve_rx_ring *rx, int how)
{
	struct gve_rx_buf_dqo *buf;
	bus_dma_segment_t segs[1];
	int nsegs;
	int err;

	buf = SLIST_FIRST(&rx->dqo.free_bufs);
	if (__predict_false(!buf)) {
		device_printf(rx->com.priv->dev,
		    "Unexpected empty free bufs list\n");
		return (ENOBUFS);
	}
	SLIST_REMOVE_HEAD(&rx->dqo.free_bufs, slist_entry);

	buf->mbuf = m_getcl(how, MT_DATA, M_PKTHDR);
	if (__predict_false(!buf->mbuf)) {
		err = ENOMEM;
		counter_enter();
		counter_u64_add_protected(rx->stats.rx_mbuf_mclget_null, 1);
		counter_exit();
		goto abort_with_buf;
	}
	buf->mbuf->m_len = MCLBYTES;

	err = bus_dmamap_load_mbuf_sg(rx->dqo.buf_dmatag, buf->dmamap,
	    buf->mbuf, segs, &nsegs, BUS_DMA_NOWAIT);
	if (__predict_false(err != 0)) {
		counter_enter();
		counter_u64_add_protected(rx->stats.rx_mbuf_dmamap_err, 1);
		counter_exit();
		goto abort_with_mbuf;
	}
	KASSERT(nsegs == 1, ("dma segs for a cluster mbuf is not 1"));
	buf->addr = segs[0].ds_addr;

	gve_rx_post_buf_dqo(rx, buf);
	return (0);

abort_with_mbuf:
	m_freem(buf->mbuf);
	buf->mbuf = NULL;
abort_with_buf:
	SLIST_INSERT_HEAD(&rx->dqo.free_bufs, buf, slist_entry);
	return (err);
}

static struct gve_dma_handle *
gve_get_page_dma_handle(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf)
{
	return (&(rx->com.qpl->dmas[buf - rx->dqo.bufs]));
}

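/*
 * QPL mode: post one fragment of a QPL page. The descriptor's buffer
 * id encodes both the page index and the fragment number within that
 * page.
 */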
static void
gve_rx_post_qpl_buf_dqo(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf,
    uint8_t frag_num)
{
	struct gve_rx_desc_dqo *desc = &rx->dqo.desc_ring[rx->dqo.head];
	union gve_rx_qpl_buf_id_dqo composed_id;
	struct gve_dma_handle *page_dma_handle;

	composed_id.buf_id = buf - rx->dqo.bufs;
	composed_id.frag_num = frag_num;
	desc->buf_id = htole16(composed_id.all);

	page_dma_handle = gve_get_page_dma_handle(rx, buf);
	bus_dmamap_sync(page_dma_handle->tag, page_dma_handle->map,
	    BUS_DMASYNC_PREREAD);
	desc->buf_addr = htole64(page_dma_handle->bus_addr +
	    frag_num * GVE_DEFAULT_RX_BUFFER_SIZE);

	buf->num_nic_frags++;
	gve_rx_advance_head_dqo(rx);
}

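/*
 * Move buffers whose pages are no longer referenced by the stack
 * (wire count back to 1) from used_bufs to free_bufs. At most one
 * head-of-line blocker is skipped over; the scan stops at a second
 * still-referenced page. With just_one set, stop after reclaiming a
 * single buffer.
 */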
static void
gve_rx_maybe_extract_from_used_bufs(struct gve_rx_ring *rx, bool just_one)
{
	struct gve_rx_buf_dqo *hol_blocker = NULL;
	struct gve_rx_buf_dqo *buf;
	u_int ref_count;
	vm_page_t page;

	while (true) {
		buf = STAILQ_FIRST(&rx->dqo.used_bufs);
		if (__predict_false(buf == NULL))
			break;

		page = rx->com.qpl->pages[buf - rx->dqo.bufs];
		ref_count = atomic_load_int(&page->ref_count);

		if (VPRC_WIRE_COUNT(ref_count) != 1) {
			/* Account for one head-of-line blocker */
			if (hol_blocker != NULL)
				break;
			hol_blocker = buf;
			STAILQ_REMOVE_HEAD(&rx->dqo.used_bufs,
			    stailq_entry);
			continue;
		}

		STAILQ_REMOVE_HEAD(&rx->dqo.used_bufs,
		    stailq_entry);
		SLIST_INSERT_HEAD(&rx->dqo.free_bufs,
		    buf, slist_entry);
		if (just_one)
			break;
	}

	if (hol_blocker != NULL)
		STAILQ_INSERT_HEAD(&rx->dqo.used_bufs,
		    hol_blocker, stailq_entry);
}

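/*
 * QPL mode: post the next unposted fragment of a page from free_bufs,
 * reclaiming a page from used_bufs first if free_bufs is empty. A page
 * is removed from free_bufs only once all of its fragments have been
 * posted.
 */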
static int
gve_rx_post_new_dqo_qpl_buf(struct gve_rx_ring *rx)
{
	struct gve_rx_buf_dqo *buf;

	buf = SLIST_FIRST(&rx->dqo.free_bufs);
	if (__predict_false(buf == NULL)) {
		gve_rx_maybe_extract_from_used_bufs(rx, /*just_one=*/true);
		buf = SLIST_FIRST(&rx->dqo.free_bufs);
		if (__predict_false(buf == NULL))
			return (ENOBUFS);
	}

	gve_rx_post_qpl_buf_dqo(rx, buf, buf->next_idx);
	if (buf->next_idx == GVE_DQ_NUM_FRAGS_IN_PAGE - 1)
		buf->next_idx = 0;
	else
		buf->next_idx++;

	/*
	 * We have posted all the frags in this buf to the NIC.
	 * - buf will enter used_bufs once the last completion arrives.
	 * - It will re-enter free_bufs in gve_rx_maybe_extract_from_used_bufs
	 *   when its wire count drops back to 1.
	 */
	if (buf->next_idx == 0)
		SLIST_REMOVE_HEAD(&rx->dqo.free_bufs, slist_entry);
	return (0);
}

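/*
 * Refill the descriptor ring: keep posting new buffers until the
 * number outstanding reaches the ring size minus one, stopping early
 * if an allocation fails.
 */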
static void
gve_rx_post_buffers_dqo(struct gve_rx_ring *rx, int how)
{
	uint32_t num_pending_bufs;
	uint32_t num_to_post;
	uint32_t i;
	int err;

	num_pending_bufs = (rx->dqo.head - rx->dqo.tail) & rx->dqo.mask;
	num_to_post = rx->dqo.mask - num_pending_bufs;

	for (i = 0; i < num_to_post; i++) {
		if (gve_is_qpl(rx->com.priv))
			err = gve_rx_post_new_dqo_qpl_buf(rx);
		else
			err = gve_rx_post_new_mbuf_dqo(rx, how);
		if (err)
			break;
	}
}

void
gve_rx_prefill_buffers_dqo(struct gve_rx_ring *rx)
{
	gve_rx_post_buffers_dqo(rx, M_WAITOK);
}

static void
gve_rx_set_hashtype_dqo(struct mbuf *mbuf, struct gve_ptype *ptype, bool *is_tcp)
{
	switch (ptype->l3_type) {
	case GVE_L3_TYPE_IPV4:
		switch (ptype->l4_type) {
		case GVE_L4_TYPE_TCP:
			*is_tcp = true;
			M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV4);
			break;
		case GVE_L4_TYPE_UDP:
			M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV4);
			break;
		default:
			M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV4);
		}
		break;
	case GVE_L3_TYPE_IPV6:
		switch (ptype->l4_type) {
		case GVE_L4_TYPE_TCP:
			*is_tcp = true;
			M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV6);
			break;
		case GVE_L4_TYPE_UDP:
			M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV6);
			break;
		default:
			M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV6);
		}
		break;
	default:
		M_HASHTYPE_SET(mbuf, M_HASHTYPE_OPAQUE_HASH);
	}
}

static void
gve_rx_set_csum_flags_dqo(struct mbuf *mbuf,
    struct gve_rx_compl_desc_dqo *desc,
    struct gve_ptype *ptype)
{
	/* HW did not identify and process L3 and L4 headers. */
	if (__predict_false(!desc->l3_l4_processed))
		return;

	if (ptype->l3_type == GVE_L3_TYPE_IPV4) {
		if (__predict_false(desc->csum_ip_err ||
		    desc->csum_external_ip_err))
			return;
	} else if (ptype->l3_type == GVE_L3_TYPE_IPV6) {
		/* Checksum should be skipped if this flag is set. */
		if (__predict_false(desc->ipv6_ex_add))
			return;
	}

	if (__predict_false(desc->csum_l4_err))
		return;

	switch (ptype->l4_type) {
	case GVE_L4_TYPE_TCP:
	case GVE_L4_TYPE_UDP:
	case GVE_L4_TYPE_ICMP:
	case GVE_L4_TYPE_SCTP:
		mbuf->m_pkthdr.csum_flags = CSUM_IP_CHECKED |
					    CSUM_IP_VALID |
					    CSUM_DATA_VALID |
					    CSUM_PSEUDO_HDR;
		mbuf->m_pkthdr.csum_data = 0xffff;
		break;
	default:
		break;
	}
}

static void
gve_rx_input_mbuf_dqo(struct gve_rx_ring *rx,
    struct gve_rx_compl_desc_dqo *compl_desc)
{
	struct mbuf *mbuf = rx->ctx.mbuf_head;
	if_t ifp = rx->com.priv->ifp;
	struct gve_ptype *ptype;
	bool do_if_input = true;
	bool is_tcp = false;

	ptype = &rx->com.priv->ptype_lut_dqo->ptypes[compl_desc->packet_type];
	gve_rx_set_hashtype_dqo(mbuf, ptype, &is_tcp);
	mbuf->m_pkthdr.flowid = le32toh(compl_desc->hash);
	gve_rx_set_csum_flags_dqo(mbuf, compl_desc, ptype);

	mbuf->m_pkthdr.rcvif = ifp;
	mbuf->m_pkthdr.len = rx->ctx.total_size;

	if (((if_getcapenable(rx->com.priv->ifp) & IFCAP_LRO) != 0) &&
	    is_tcp &&
	    (rx->lro.lro_cnt != 0) &&
	    (tcp_lro_rx(&rx->lro, mbuf, 0) == 0))
		do_if_input = false;

	if (do_if_input)
		if_input(ifp, mbuf);

	counter_enter();
	counter_u64_add_protected(rx->stats.rbytes, rx->ctx.total_size);
	counter_u64_add_protected(rx->stats.rpackets, 1);
	counter_exit();

	rx->ctx = (struct gve_rx_ctx){};
}

static int
gve_rx_copybreak_dqo(struct gve_rx_ring *rx, void *va,
    struct gve_rx_compl_desc_dqo *compl_desc, uint16_t frag_len)
{
	struct mbuf *mbuf;

	mbuf = m_get2(frag_len, M_NOWAIT, MT_DATA, M_PKTHDR);
	if (__predict_false(mbuf == NULL))
		return (ENOMEM);

	counter_enter();
	counter_u64_add_protected(rx->stats.rx_copybreak_cnt, 1);
	counter_exit();

	m_copyback(mbuf, 0, frag_len, va);
	mbuf->m_len = frag_len;

	rx->ctx.mbuf_head = mbuf;
	rx->ctx.mbuf_tail = mbuf;
	rx->ctx.total_size += frag_len;

	gve_rx_input_mbuf_dqo(rx, compl_desc);
	return (0);
}

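/*
 * Handle one buffer completion in non-QPL mode: validate the buffer
 * id, either copy a small final fragment out (copybreak) or chain the
 * buffer's mbuf onto the per-ring rx context, and hand the completed
 * packet to the stack on the last fragment.
 */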
610 gve_rx_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
611     struct gve_rx_compl_desc_dqo *compl_desc,
612     int *work_done)
613 {
614 	bool is_last_frag = compl_desc->end_of_packet != 0;
615 	struct gve_rx_ctx *ctx = &rx->ctx;
616 	struct gve_rx_buf_dqo *buf;
617 	uint32_t num_pending_bufs;
618 	uint16_t frag_len;
619 	uint16_t buf_id;
620 	int err;
621 
622 	buf_id = le16toh(compl_desc->buf_id);
623 	if (__predict_false(buf_id >= rx->dqo.buf_cnt)) {
624 		device_printf(priv->dev, "Invalid rx buf id %d on rxq %d, issuing reset\n",
625 		    buf_id, rx->com.id);
626 		gve_schedule_reset(priv);
627 		goto drop_frag_clear_ctx;
628 	}
629 	buf = &rx->dqo.bufs[buf_id];
630 	if (__predict_false(buf->mbuf == NULL)) {
631 		device_printf(priv->dev, "Spurious completion for buf id %d on rxq %d, issuing reset\n",
632 		    buf_id, rx->com.id);
633 		gve_schedule_reset(priv);
634 		goto drop_frag_clear_ctx;
635 	}
636 
637 	if (__predict_false(ctx->drop_pkt))
638 		goto drop_frag;
639 
640 	if (__predict_false(compl_desc->rx_error)) {
641 		counter_enter();
642 		counter_u64_add_protected(rx->stats.rx_dropped_pkt_desc_err, 1);
643 		counter_exit();
644 		goto drop_frag;
645 	}
646 
647 	bus_dmamap_sync(rx->dqo.buf_dmatag, buf->dmamap,
648 	    BUS_DMASYNC_POSTREAD);
649 
650 	frag_len = compl_desc->packet_len;
651 	if (frag_len <= priv->rx_copybreak && !ctx->mbuf_head && is_last_frag) {
652 		err = gve_rx_copybreak_dqo(rx, mtod(buf->mbuf, char*),
653 		    compl_desc, frag_len);
654 		if (__predict_false(err != 0))
655 			goto drop_frag;
656 		(*work_done)++;
657 		gve_rx_post_buf_dqo(rx, buf);
658 		return;
659 	}
660 
661 	/*
662 	 * Although buffer completions may arrive out of order, buffer
663 	 * descriptors are consumed by the NIC in order. That is, the
664 	 * buffer at desc_ring[tail] might not be the buffer we got the
665 	 * completion compl_ring[tail] for: but we know that desc_ring[tail]
666 	 * has already been read by the NIC.
667 	 */
668 	num_pending_bufs = (rx->dqo.head - rx->dqo.tail) & rx->dqo.mask;
669 
670 	/*
671 	 * For every fragment received, try to post a new buffer.
672 	 *
673 	 * Failures are okay but only so long as the number of outstanding
674 	 * buffers is above a threshold.
675 	 *
676 	 * Beyond that we drop new packets to reuse their buffers.
677 	 * Without ensuring a minimum number of buffers for the NIC to
678 	 * put packets in, we run the risk of getting the queue stuck
679 	 * for good.
680 	 */
681 	err = gve_rx_post_new_mbuf_dqo(rx, M_NOWAIT);
682 	if (__predict_false(err != 0 &&
683 	    num_pending_bufs <= GVE_RX_DQO_MIN_PENDING_BUFS)) {
684 		counter_enter();
685 		counter_u64_add_protected(
686 		    rx->stats.rx_dropped_pkt_mbuf_alloc_fail, 1);
687 		counter_exit();
688 		goto drop_frag;
689 	}
690 
691 	buf->mbuf->m_len = frag_len;
692 	ctx->total_size += frag_len;
693 	if (ctx->mbuf_tail == NULL) {
694 		ctx->mbuf_head = buf->mbuf;
695 		ctx->mbuf_tail = buf->mbuf;
696 	} else {
697 		buf->mbuf->m_flags &= ~M_PKTHDR;
698 		ctx->mbuf_tail->m_next = buf->mbuf;
699 		ctx->mbuf_tail = buf->mbuf;
700 	}
701 
702 	/*
703 	 * Disassociate the mbuf from buf and surrender buf to the free list to
704 	 * be used by a future mbuf.
705 	 */
706 	bus_dmamap_unload(rx->dqo.buf_dmatag, buf->dmamap);
707 	buf->mbuf = NULL;
708 	buf->addr = 0;
709 	SLIST_INSERT_HEAD(&rx->dqo.free_bufs, buf, slist_entry);
710 
711 	if (is_last_frag) {
712 		gve_rx_input_mbuf_dqo(rx, compl_desc);
713 		(*work_done)++;
714 	}
715 	return;
716 
717 drop_frag:
718 	/* Clear the earlier frags if there were any */
719 	m_freem(ctx->mbuf_head);
720 	rx->ctx = (struct gve_rx_ctx){};
721 	/* Drop the rest of the pkt if there are more frags */
722 	ctx->drop_pkt = true;
723 	/* Reuse the dropped frag's buffer */
724 	gve_rx_post_buf_dqo(rx, buf);
725 
726 	if (is_last_frag)
727 		goto drop_frag_clear_ctx;
728 	return;
729 
730 drop_frag_clear_ctx:
731 	counter_enter();
732 	counter_u64_add_protected(rx->stats.rx_dropped_pkt, 1);
733 	counter_exit();
734 	m_freem(ctx->mbuf_head);
735 	rx->ctx = (struct gve_rx_ctx){};
736 }
737 
738 static void *
739 gve_get_cpu_addr_for_qpl_buf(struct gve_rx_ring *rx,
740     struct gve_rx_buf_dqo *buf, uint8_t buf_frag_num)
741 {
742 	int page_idx = buf - rx->dqo.bufs;
743 	void *va = rx->com.qpl->dmas[page_idx].cpu_addr;
744 
745 	va = (char *)va + (buf_frag_num * GVE_DEFAULT_RX_BUFFER_SIZE);
746 	return (va);
747 }
748 
749 static int
750 gve_rx_add_clmbuf_to_ctx(struct gve_rx_ring *rx,
751     struct gve_rx_ctx *ctx, struct gve_rx_buf_dqo *buf,
752     uint8_t buf_frag_num, uint16_t frag_len)
753 {
754 	void *va = gve_get_cpu_addr_for_qpl_buf(rx, buf, buf_frag_num);
755 	struct mbuf *mbuf;
756 
757 	if (ctx->mbuf_tail == NULL) {
758 		mbuf = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
759 		if (mbuf == NULL)
760 			return (ENOMEM);
761 		ctx->mbuf_head = mbuf;
762 		ctx->mbuf_tail = mbuf;
763 	} else {
764 		mbuf = m_getcl(M_NOWAIT, MT_DATA, 0);
765 		if (mbuf == NULL)
766 			return (ENOMEM);
767 		ctx->mbuf_tail->m_next = mbuf;
768 		ctx->mbuf_tail = mbuf;
769 	}
770 
771 	mbuf->m_len = frag_len;
772 	ctx->total_size += frag_len;
773 
774 	m_copyback(mbuf, 0, frag_len, va);
775 	counter_enter();
776 	counter_u64_add_protected(rx->stats.rx_frag_copy_cnt, 1);
777 	counter_exit();
778 	return (0);
779 }
780 
781 static int
782 gve_rx_add_extmbuf_to_ctx(struct gve_rx_ring *rx,
783     struct gve_rx_ctx *ctx, struct gve_rx_buf_dqo *buf,
784     uint8_t buf_frag_num, uint16_t frag_len)
785 {
786 	struct mbuf *mbuf;
787 	void *page_addr;
788 	vm_page_t page;
789 	int page_idx;
790 	void *va;
791 
792 	if (ctx->mbuf_tail == NULL) {
793 		mbuf = m_gethdr(M_NOWAIT, MT_DATA);
794 		if (mbuf == NULL)
795 			return (ENOMEM);
796 		ctx->mbuf_head = mbuf;
797 		ctx->mbuf_tail = mbuf;
798 	} else {
799 		mbuf = m_get(M_NOWAIT, MT_DATA);
800 		if (mbuf == NULL)
801 			return (ENOMEM);
802 		ctx->mbuf_tail->m_next = mbuf;
803 		ctx->mbuf_tail = mbuf;
804 	}
805 
806 	mbuf->m_len = frag_len;
807 	ctx->total_size += frag_len;
808 
809 	page_idx = buf - rx->dqo.bufs;
810 	page = rx->com.qpl->pages[page_idx];
811 	page_addr = rx->com.qpl->dmas[page_idx].cpu_addr;
812 	va = (char *)page_addr + (buf_frag_num * GVE_DEFAULT_RX_BUFFER_SIZE);
813 
814 	/*
815 	 * Grab an extra ref to the page so that gve_mextadd_free
816 	 * does not end up freeing the page while the interface exists.
817 	 */
818 	vm_page_wire(page);
819 
820 	counter_enter();
821 	counter_u64_add_protected(rx->stats.rx_frag_flip_cnt, 1);
822 	counter_exit();
823 
824 	MEXTADD(mbuf, va, frag_len,
825 	    gve_mextadd_free, page, page_addr,
826 	    0, EXT_NET_DRV);
827 	return (0);
828 }
829 
830 static void
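/*
 * Handle one buffer completion in QPL mode. Fragments are normally
 * attached to the mbuf chain zero-copy, as external mbufs backed by
 * wired QPL pages; when the ring is running low on posted buffers the
 * fragment is instead copied into a cluster mbuf so its buffer can be
 * reposted immediately. Small single-fragment packets may also be
 * copied out via the copybreak path.
 */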
831 gve_rx_dqo_qpl(struct gve_priv *priv, struct gve_rx_ring *rx,
832     struct gve_rx_compl_desc_dqo *compl_desc,
833     int *work_done)
834 {
835 	bool is_last_frag = compl_desc->end_of_packet != 0;
836 	union gve_rx_qpl_buf_id_dqo composed_id;
837 	struct gve_dma_handle *page_dma_handle;
838 	struct gve_rx_ctx *ctx = &rx->ctx;
839 	struct gve_rx_buf_dqo *buf;
840 	uint32_t num_pending_bufs;
841 	uint8_t buf_frag_num;
842 	uint16_t frag_len;
843 	uint16_t buf_id;
844 	int err;
845 
846 	composed_id.all = le16toh(compl_desc->buf_id);
847 	buf_id = composed_id.buf_id;
848 	buf_frag_num = composed_id.frag_num;
849 
850 	if (__predict_false(buf_id >= rx->dqo.buf_cnt)) {
851 		device_printf(priv->dev, "Invalid rx buf id %d on rxq %d, issuing reset\n",
852 		    buf_id, rx->com.id);
853 		gve_schedule_reset(priv);
854 		goto drop_frag_clear_ctx;
855 	}
856 	buf = &rx->dqo.bufs[buf_id];
857 	if (__predict_false(buf->num_nic_frags == 0 ||
858 	    buf_frag_num > GVE_DQ_NUM_FRAGS_IN_PAGE - 1)) {
859 		device_printf(priv->dev, "Spurious compl for buf id %d on rxq %d "
860 		    "with buf_frag_num %d and num_nic_frags %d, issuing reset\n",
861 		    buf_id, rx->com.id, buf_frag_num, buf->num_nic_frags);
862 		gve_schedule_reset(priv);
863 		goto drop_frag_clear_ctx;
864 	}
865 
866 	buf->num_nic_frags--;
867 
868 	if (__predict_false(ctx->drop_pkt))
869 		goto drop_frag;
870 
871 	if (__predict_false(compl_desc->rx_error)) {
872 		counter_enter();
873 		counter_u64_add_protected(rx->stats.rx_dropped_pkt_desc_err, 1);
874 		counter_exit();
875 		goto drop_frag;
876 	}
877 
878 	page_dma_handle = gve_get_page_dma_handle(rx, buf);
879 	bus_dmamap_sync(page_dma_handle->tag, page_dma_handle->map,
880 	    BUS_DMASYNC_POSTREAD);
881 
882 	frag_len = compl_desc->packet_len;
883 	if (frag_len <= priv->rx_copybreak && !ctx->mbuf_head && is_last_frag) {
884 		void *va = gve_get_cpu_addr_for_qpl_buf(rx, buf, buf_frag_num);
885 
886 		err = gve_rx_copybreak_dqo(rx, va, compl_desc, frag_len);
887 		if (__predict_false(err != 0))
888 			goto drop_frag;
889 		(*work_done)++;
890 		gve_rx_post_qpl_buf_dqo(rx, buf, buf_frag_num);
891 		return;
892 	}
893 
894 	num_pending_bufs = (rx->dqo.head - rx->dqo.tail) & rx->dqo.mask;
895 	err = gve_rx_post_new_dqo_qpl_buf(rx);
896 	if (__predict_false(err != 0 &&
897 	    num_pending_bufs <= GVE_RX_DQO_MIN_PENDING_BUFS)) {
898 		/*
899 		 * Resort to copying this fragment into a cluster mbuf
900 		 * when the above threshold is breached and repost the
901 		 * incoming buffer. If we cannot find cluster mbufs,
902 		 * just drop the packet (to repost its buffer).
903 		 */
904 		err = gve_rx_add_clmbuf_to_ctx(rx, ctx, buf,
905 		    buf_frag_num, frag_len);
906 		if (err != 0) {
907 			counter_enter();
908 			counter_u64_add_protected(
909 			    rx->stats.rx_dropped_pkt_buf_post_fail, 1);
910 			counter_exit();
911 			goto drop_frag;
912 		}
913 		gve_rx_post_qpl_buf_dqo(rx, buf, buf_frag_num);
914 	} else {
915 		err = gve_rx_add_extmbuf_to_ctx(rx, ctx, buf,
916 		    buf_frag_num, frag_len);
917 		if (__predict_false(err != 0)) {
918 			counter_enter();
919 			counter_u64_add_protected(
920 			    rx->stats.rx_dropped_pkt_mbuf_alloc_fail, 1);
921 			counter_exit();
922 			goto drop_frag;
923 		}
924 	}
925 
926 	/*
927 	 * Both the counts need to be checked.
928 	 *
929 	 * num_nic_frags == 0 implies no pending completions
930 	 * but not all frags may have yet been posted.
931 	 *
932 	 * next_idx == 0 implies all frags have been posted
933 	 * but there might be pending completions.
934 	 */
935 	if (buf->num_nic_frags == 0 && buf->next_idx == 0)
936 		STAILQ_INSERT_TAIL(&rx->dqo.used_bufs, buf, stailq_entry);
937 
938 	if (is_last_frag) {
939 		gve_rx_input_mbuf_dqo(rx, compl_desc);
940 		(*work_done)++;
941 	}
942 	return;
943 
944 drop_frag:
945 	/* Clear the earlier frags if there were any */
946 	m_freem(ctx->mbuf_head);
947 	rx->ctx = (struct gve_rx_ctx){};
948 	/* Drop the rest of the pkt if there are more frags */
949 	ctx->drop_pkt = true;
950 	/* Reuse the dropped frag's buffer */
951 	gve_rx_post_qpl_buf_dqo(rx, buf, buf_frag_num);
952 
953 	if (is_last_frag)
954 		goto drop_frag_clear_ctx;
955 	return;
956 
957 drop_frag_clear_ctx:
958 	counter_enter();
959 	counter_u64_add_protected(rx->stats.rx_dropped_pkt, 1);
960 	counter_exit();
961 	m_freem(ctx->mbuf_head);
962 	rx->ctx = (struct gve_rx_ctx){};
963 }
964 
965 static uint8_t
966 gve_rx_get_gen_bit(uint8_t *desc)
967 {
968 	uint8_t byte;
969 
970 	/*
971 	 * Prevent generation bit from being read after the rest of the
972 	 * descriptor.
973 	 */
974 	byte = atomic_load_acq_8(desc + GVE_RX_DESC_DQO_GEN_BYTE_OFFSET);
975 	return ((byte & GVE_RX_DESC_DQO_GEN_BIT_MASK) != 0);
976 }
977 
978 static bool
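/*
 * Drain up to budget completions from the completion ring, using the
 * generation bit to detect entries newly written by the NIC, then
 * flush LRO and repost rx buffers. Returns true if the budget was
 * exhausted and more work may be pending.
 */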
979 gve_rx_cleanup_dqo(struct gve_priv *priv, struct gve_rx_ring *rx, int budget)
980 {
981 	struct gve_rx_compl_desc_dqo *compl_desc;
982 	uint32_t work_done = 0;
983 
984 	NET_EPOCH_ASSERT();
985 
986 	while (work_done < budget) {
987 		bus_dmamap_sync(rx->dqo.compl_ring_mem.tag,
988 		    rx->dqo.compl_ring_mem.map,
989 		    BUS_DMASYNC_POSTREAD);
990 
991 		compl_desc = &rx->dqo.compl_ring[rx->dqo.tail];
992 		if (gve_rx_get_gen_bit((uint8_t *)compl_desc) ==
993 		    rx->dqo.cur_gen_bit)
994 			break;
995 
996 		rx->cnt++;
997 		rx->dqo.tail = (rx->dqo.tail + 1) & rx->dqo.mask;
998 		rx->dqo.cur_gen_bit ^= (rx->dqo.tail == 0);
999 
1000 		if (gve_is_qpl(priv))
1001 			gve_rx_dqo_qpl(priv, rx, compl_desc, &work_done);
1002 		else
1003 			gve_rx_dqo(priv, rx, compl_desc, &work_done);
1004 	}
1005 
1006 	if (work_done != 0)
1007 		tcp_lro_flush_all(&rx->lro);
1008 
1009 	gve_rx_post_buffers_dqo(rx, M_NOWAIT);
1010 	if (gve_is_qpl(priv))
1011 		gve_rx_maybe_extract_from_used_bufs(rx, /*just_one=*/false);
1012 	return (work_done == budget);
1013 }
1014 
1015 void
1016 gve_rx_cleanup_tq_dqo(void *arg, int pending)
1017 {
1018 	struct gve_rx_ring *rx = arg;
1019 	struct gve_priv *priv = rx->com.priv;
1020 
1021 	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
1022 		return;
1023 
1024 	if (gve_rx_cleanup_dqo(priv, rx, /*budget=*/64)) {
1025 		taskqueue_enqueue(rx->com.cleanup_tq, &rx->com.cleanup_task);
1026 		return;
1027 	}
1028 
1029 	gve_db_bar_dqo_write_4(priv, rx->com.irq_db_offset,
1030 	    GVE_ITR_NO_UPDATE_DQO | GVE_ITR_ENABLE_BIT_DQO);
}
1032