xref: /freebsd/sys/dev/gve/gve_rx_dqo.c (revision 7d0873ebb83b19ba1e8a89e679470d885efe12e3)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 2024 Google LLC
5  *
6  * Redistribution and use in source and binary forms, with or without modification,
7  * are permitted provided that the following conditions are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright notice, this
10  *    list of conditions and the following disclaimer.
11  *
12  * 2. Redistributions in binary form must reproduce the above copyright notice,
13  *    this list of conditions and the following disclaimer in the documentation
14  *    and/or other materials provided with the distribution.
15  *
16  * 3. Neither the name of the copyright holder nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software without
18  *    specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
22  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
24  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
25  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
26  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
27  * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
29  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  */
31 #include "gve.h"
32 #include "gve_adminq.h"
33 #include "gve_dqo.h"
34 
35 static void
36 gve_free_rx_mbufs_dqo(struct gve_rx_ring *rx)
37 {
38 	struct gve_rx_buf_dqo *buf;
39 	int i;
40 
41 	if (gve_is_qpl(rx->com.priv))
42 		return;
43 
44 	for (i = 0; i < rx->dqo.buf_cnt; i++) {
45 		buf = &rx->dqo.bufs[i];
46 		if (!buf->mbuf)
47 			continue;
48 
49 		bus_dmamap_sync(rx->dqo.buf_dmatag, buf->dmamap,
50 		    BUS_DMASYNC_POSTREAD);
51 		bus_dmamap_unload(rx->dqo.buf_dmatag, buf->dmamap);
52 		m_freem(buf->mbuf);
53 		buf->mbuf = NULL;
54 	}
55 }
56 
57 void
58 gve_rx_free_ring_dqo(struct gve_priv *priv, int i)
59 {
60 	struct gve_rx_ring *rx = &priv->rx[i];
61 	int j;
62 
63 	if (rx->dqo.compl_ring != NULL) {
64 		gve_dma_free_coherent(&rx->dqo.compl_ring_mem);
65 		rx->dqo.compl_ring = NULL;
66 	}
67 
68 	if (rx->dqo.desc_ring != NULL) {
69 		gve_dma_free_coherent(&rx->desc_ring_mem);
70 		rx->dqo.desc_ring = NULL;
71 	}
72 
73 	if (rx->dqo.bufs != NULL) {
74 		gve_free_rx_mbufs_dqo(rx);
75 
76 		if (!gve_is_qpl(priv) && rx->dqo.buf_dmatag) {
77 			for (j = 0; j < rx->dqo.buf_cnt; j++)
78 				if (rx->dqo.bufs[j].mapped)
79 					bus_dmamap_destroy(rx->dqo.buf_dmatag,
80 					    rx->dqo.bufs[j].dmamap);
81 		}
82 
83 		free(rx->dqo.bufs, M_GVE);
84 		rx->dqo.bufs = NULL;
85 	}
86 
87 	if (!gve_is_qpl(priv) && rx->dqo.buf_dmatag)
88 		bus_dma_tag_destroy(rx->dqo.buf_dmatag);
89 }
90 
91 int
92 gve_rx_alloc_ring_dqo(struct gve_priv *priv, int i)
93 {
94 	struct gve_rx_ring *rx = &priv->rx[i];
95 	int err;
96 	int j;
97 
98 	err = gve_dma_alloc_coherent(priv,
99 	    sizeof(struct gve_rx_desc_dqo) * priv->rx_desc_cnt,
100 	    CACHE_LINE_SIZE, &rx->desc_ring_mem);
101 	if (err != 0) {
102 		device_printf(priv->dev,
103 		    "Failed to alloc desc ring for rx ring %d", i);
104 		goto abort;
105 	}
106 	rx->dqo.desc_ring = rx->desc_ring_mem.cpu_addr;
107 	rx->dqo.mask = priv->rx_desc_cnt - 1;
108 
109 	err = gve_dma_alloc_coherent(priv,
110 	    sizeof(struct gve_rx_compl_desc_dqo) * priv->rx_desc_cnt,
111 	    CACHE_LINE_SIZE, &rx->dqo.compl_ring_mem);
112 	if (err != 0) {
113 		device_printf(priv->dev,
114 		    "Failed to alloc compl ring for rx ring %d", i);
115 		goto abort;
116 	}
117 	rx->dqo.compl_ring = rx->dqo.compl_ring_mem.cpu_addr;
118 	rx->dqo.mask = priv->rx_desc_cnt - 1;
119 
120 	rx->dqo.buf_cnt = gve_is_qpl(priv) ? GVE_RX_NUM_QPL_PAGES_DQO :
121 	    priv->rx_desc_cnt;
122 	rx->dqo.bufs = malloc(rx->dqo.buf_cnt * sizeof(struct gve_rx_buf_dqo),
123 	    M_GVE, M_WAITOK | M_ZERO);
124 
125 	if (gve_is_qpl(priv)) {
126 		rx->com.qpl = &priv->qpls[priv->tx_cfg.max_queues + i];
127 		if (rx->com.qpl == NULL) {
128 			device_printf(priv->dev, "No QPL left for rx ring %d", i);
129 			return (ENOMEM);
130 		}
131 		return (0);
132 	}
133 
134 	err = bus_dma_tag_create(
135 	    bus_get_dma_tag(priv->dev),	/* parent */
136 	    1, 0,			/* alignment, bounds */
137 	    BUS_SPACE_MAXADDR,		/* lowaddr */
138 	    BUS_SPACE_MAXADDR,		/* highaddr */
139 	    NULL, NULL,			/* filter, filterarg */
140 	    MCLBYTES,			/* maxsize */
141 	    1,				/* nsegments */
142 	    MCLBYTES,			/* maxsegsize */
143 	    0,				/* flags */
144 	    NULL,			/* lockfunc */
145 	    NULL,			/* lockarg */
146 	    &rx->dqo.buf_dmatag);
147 	if (err != 0) {
148 		device_printf(priv->dev,
149 		    "%s: bus_dma_tag_create failed: %d\n",
150 		    __func__, err);
151 		goto abort;
152 	}
153 
154 	for (j = 0; j < rx->dqo.buf_cnt; j++) {
155 		err = bus_dmamap_create(rx->dqo.buf_dmatag, 0,
156 		    &rx->dqo.bufs[j].dmamap);
157 		if (err != 0) {
158 			device_printf(priv->dev,
159 			    "err in creating rx buf dmamap %d: %d",
160 			    j, err);
161 			goto abort;
162 		}
163 		rx->dqo.bufs[j].mapped = true;
164 	}
165 
166 	return (0);
167 
168 abort:
169 	gve_rx_free_ring_dqo(priv, i);
170 	return (err);
171 }
172 
173 static void
174 gve_rx_clear_desc_ring_dqo(struct gve_rx_ring *rx)
175 {
176 	struct gve_ring_com *com = &rx->com;
177 	int entries;
178 	int i;
179 
180 	entries = com->priv->rx_desc_cnt;
181 	for (i = 0; i < entries; i++)
182 		rx->dqo.desc_ring[i] = (struct gve_rx_desc_dqo){};
183 
184 	bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map,
185 	    BUS_DMASYNC_PREWRITE);
186 }
187 
188 static void
189 gve_rx_clear_compl_ring_dqo(struct gve_rx_ring *rx)
190 {
191 	struct gve_ring_com *com = &rx->com;
192 	int i;
193 
194 	for (i = 0; i < com->priv->rx_desc_cnt; i++)
195 		rx->dqo.compl_ring[i] = (struct gve_rx_compl_desc_dqo){};
196 
197 	bus_dmamap_sync(rx->dqo.compl_ring_mem.tag, rx->dqo.compl_ring_mem.map,
198 	    BUS_DMASYNC_PREWRITE);
199 }
200 
201 void
202 gve_clear_rx_ring_dqo(struct gve_priv *priv, int i)
203 {
204 	struct gve_rx_ring *rx = &priv->rx[i];
205 	int j;
206 
207 	rx->fill_cnt = 0;
208 	rx->cnt = 0;
209 	rx->dqo.mask = priv->rx_desc_cnt - 1;
210 	rx->dqo.head = 0;
211 	rx->dqo.tail = 0;
212 	rx->dqo.cur_gen_bit = 0;
213 
214 	gve_rx_clear_desc_ring_dqo(rx);
215 	gve_rx_clear_compl_ring_dqo(rx);
216 
217 	gve_free_rx_mbufs_dqo(rx);
218 
219 	if (gve_is_qpl(priv)) {
220 		SLIST_INIT(&rx->dqo.free_bufs);
221 		STAILQ_INIT(&rx->dqo.used_bufs);
222 
223 		for (j = 0; j < rx->dqo.buf_cnt; j++) {
224 			struct gve_rx_buf_dqo *buf = &rx->dqo.bufs[j];
225 
226 			vm_page_t page = rx->com.qpl->pages[buf - rx->dqo.bufs];
227 			u_int ref_count = atomic_load_int(&page->ref_count);
228 
229 			/*
230 			 * An ifconfig down+up might see pages still in flight
231 			 * from the previous innings.
232 			 */
233 			if (VPRC_WIRE_COUNT(ref_count) == 1)
234 				SLIST_INSERT_HEAD(&rx->dqo.free_bufs,
235 				    buf, slist_entry);
236 			else
237 				STAILQ_INSERT_TAIL(&rx->dqo.used_bufs,
238 				    buf, stailq_entry);
239 
240 			buf->num_nic_frags = 0;
241 			buf->next_idx = 0;
242 		}
243 	} else {
244 		SLIST_INIT(&rx->dqo.free_bufs);
245 		for (j = 0; j < rx->dqo.buf_cnt; j++)
246 			SLIST_INSERT_HEAD(&rx->dqo.free_bufs,
247 			    &rx->dqo.bufs[j], slist_entry);
248 	}
249 }
250 
251 int
252 gve_rx_intr_dqo(void *arg)
253 {
254 	struct gve_rx_ring *rx = arg;
255 	struct gve_priv *priv = rx->com.priv;
256 	struct gve_ring_com *com = &rx->com;
257 
258 	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
259 		return (FILTER_STRAY);
260 
261 	/* Interrupts are automatically masked */
262 	taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task);
263 	return (FILTER_HANDLED);
264 }
265 
266 static void
267 gve_rx_advance_head_dqo(struct gve_rx_ring *rx)
268 {
269 	rx->dqo.head = (rx->dqo.head + 1) & rx->dqo.mask;
270 	rx->fill_cnt++; /* rx->fill_cnt is just a sysctl counter */
271 
272 	if ((rx->dqo.head & (GVE_RX_BUF_THRESH_DQO - 1)) == 0) {
273 		bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map,
274 		    BUS_DMASYNC_PREWRITE);
275 		gve_db_bar_dqo_write_4(rx->com.priv, rx->com.db_offset,
276 		    rx->dqo.head);
277 	}
278 }
279 
280 static void
281 gve_rx_post_buf_dqo(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf)
282 {
283 	struct gve_rx_desc_dqo *desc;
284 
285 	bus_dmamap_sync(rx->dqo.buf_dmatag, buf->dmamap,
286 	    BUS_DMASYNC_PREREAD);
287 
288 	desc = &rx->dqo.desc_ring[rx->dqo.head];
289 	desc->buf_id = htole16(buf - rx->dqo.bufs);
290 	desc->buf_addr = htole64(buf->addr);
291 
292 	gve_rx_advance_head_dqo(rx);
293 }
294 
295 static int
296 gve_rx_post_new_mbuf_dqo(struct gve_rx_ring *rx, int how)
297 {
298 	struct gve_rx_buf_dqo *buf;
299 	bus_dma_segment_t segs[1];
300 	int nsegs;
301 	int err;
302 
303 	buf = SLIST_FIRST(&rx->dqo.free_bufs);
304 	if (__predict_false(!buf)) {
305 		device_printf(rx->com.priv->dev,
306 		    "Unexpected empty free bufs list\n");
307 		return (ENOBUFS);
308 	}
309 	SLIST_REMOVE_HEAD(&rx->dqo.free_bufs, slist_entry);
310 
311 	buf->mbuf = m_getcl(how, MT_DATA, M_PKTHDR);
312 	if (__predict_false(!buf->mbuf)) {
313 		err = ENOMEM;
314 		counter_enter();
315 		counter_u64_add_protected(rx->stats.rx_mbuf_mclget_null, 1);
316 		counter_exit();
317 		goto abort_with_buf;
318 	}
319 	buf->mbuf->m_len = MCLBYTES;
320 
321 	err = bus_dmamap_load_mbuf_sg(rx->dqo.buf_dmatag, buf->dmamap,
322 	    buf->mbuf, segs, &nsegs, BUS_DMA_NOWAIT);
323 	KASSERT(nsegs == 1, ("dma segs for a cluster mbuf is not 1"));
324 	if (__predict_false(err != 0)) {
325 		counter_enter();
326 		counter_u64_add_protected(rx->stats.rx_mbuf_dmamap_err, 1);
327 		counter_exit();
328 		goto abort_with_mbuf;
329 	}
330 	buf->addr = segs[0].ds_addr;
331 
332 	gve_rx_post_buf_dqo(rx, buf);
333 	return (0);
334 
335 abort_with_mbuf:
336 	m_freem(buf->mbuf);
337 	buf->mbuf = NULL;
338 abort_with_buf:
339 	SLIST_INSERT_HEAD(&rx->dqo.free_bufs, buf, slist_entry);
340 	return (err);
341 }
342 
343 static struct gve_dma_handle *
344 gve_get_page_dma_handle(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf)
345 {
346 	return (&(rx->com.qpl->dmas[buf - rx->dqo.bufs]));
347 }
348 
349 static void
350 gve_rx_post_qpl_buf_dqo(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf,
351     uint8_t frag_num)
352 {
353 	struct gve_rx_desc_dqo *desc = &rx->dqo.desc_ring[rx->dqo.head];
354 	union gve_rx_qpl_buf_id_dqo composed_id;
355 	struct gve_dma_handle *page_dma_handle;
356 
357 	composed_id.buf_id = buf - rx->dqo.bufs;
358 	composed_id.frag_num = frag_num;
359 	desc->buf_id = htole16(composed_id.all);
360 
361 	page_dma_handle = gve_get_page_dma_handle(rx, buf);
362 	bus_dmamap_sync(page_dma_handle->tag, page_dma_handle->map,
363 	    BUS_DMASYNC_PREREAD);
364 	desc->buf_addr = htole64(page_dma_handle->bus_addr +
365 	    frag_num * GVE_DEFAULT_RX_BUFFER_SIZE);
366 
367 	buf->num_nic_frags++;
368 	gve_rx_advance_head_dqo(rx);
369 }
370 
371 static void
372 gve_rx_maybe_extract_from_used_bufs(struct gve_rx_ring *rx, bool just_one)
373 {
374 	struct gve_rx_buf_dqo *hol_blocker = NULL;
375 	struct gve_rx_buf_dqo *buf;
376 	u_int ref_count;
377 	vm_page_t page;
378 
379 	while (true) {
380 		buf = STAILQ_FIRST(&rx->dqo.used_bufs);
381 		if (__predict_false(buf == NULL))
382 			break;
383 
384 		page = rx->com.qpl->pages[buf - rx->dqo.bufs];
385 		ref_count = atomic_load_int(&page->ref_count);
386 
387 		if (VPRC_WIRE_COUNT(ref_count) != 1) {
388 			/* Account for one head-of-line blocker */
389 			if (hol_blocker != NULL)
390 				break;
391 			hol_blocker = buf;
392 			STAILQ_REMOVE_HEAD(&rx->dqo.used_bufs,
393 			    stailq_entry);
394 			continue;
395 		}
396 
397 		STAILQ_REMOVE_HEAD(&rx->dqo.used_bufs,
398 		    stailq_entry);
399 		SLIST_INSERT_HEAD(&rx->dqo.free_bufs,
400 		    buf, slist_entry);
401 		if (just_one)
402 			break;
403 	}
404 
405 	if (hol_blocker != NULL)
406 		STAILQ_INSERT_HEAD(&rx->dqo.used_bufs,
407 		    hol_blocker, stailq_entry);
408 }
409 
410 static int
411 gve_rx_post_new_dqo_qpl_buf(struct gve_rx_ring *rx)
412 {
413 	struct gve_rx_buf_dqo *buf;
414 
415 	buf = SLIST_FIRST(&rx->dqo.free_bufs);
416 	if (__predict_false(buf == NULL)) {
417 		gve_rx_maybe_extract_from_used_bufs(rx, /*just_one=*/true);
418 		buf = SLIST_FIRST(&rx->dqo.free_bufs);
419 		if (__predict_false(buf == NULL))
420 			return (ENOBUFS);
421 	}
422 
423 	gve_rx_post_qpl_buf_dqo(rx, buf, buf->next_idx);
424 	if (buf->next_idx == GVE_DQ_NUM_FRAGS_IN_PAGE - 1)
425 		buf->next_idx = 0;
426 	else
427 		buf->next_idx++;
428 
429 	/*
430 	 * We have posted all the frags in this buf to the NIC.
431 	 * - buf will enter used_bufs once the last completion arrives.
432 	 * - It will renter free_bufs in gve_rx_maybe_extract_from_used_bufs
433 	 *   when its wire count drops back to 1.
434 	 */
435 	if (buf->next_idx == 0)
436 		SLIST_REMOVE_HEAD(&rx->dqo.free_bufs, slist_entry);
437 	return (0);
438 }
439 
440 static void
441 gve_rx_post_buffers_dqo(struct gve_rx_ring *rx, int how)
442 {
443 	uint32_t num_pending_bufs;
444 	uint32_t num_to_post;
445 	uint32_t i;
446 	int err;
447 
448 	num_pending_bufs = (rx->dqo.head - rx->dqo.tail) & rx->dqo.mask;
449 	num_to_post = rx->dqo.mask - num_pending_bufs;
450 
451 	for (i = 0; i < num_to_post; i++) {
452 		if (gve_is_qpl(rx->com.priv))
453 			err = gve_rx_post_new_dqo_qpl_buf(rx);
454 		else
455 			err = gve_rx_post_new_mbuf_dqo(rx, how);
456 		if (err)
457 			break;
458 	}
459 }
460 
461 void
462 gve_rx_prefill_buffers_dqo(struct gve_rx_ring *rx)
463 {
464 	gve_rx_post_buffers_dqo(rx, M_WAITOK);
465 }
466 
467 static void
468 gve_rx_set_hashtype_dqo(struct mbuf *mbuf, struct gve_ptype *ptype, bool *is_tcp)
469 {
470 	switch (ptype->l3_type) {
471 	case GVE_L3_TYPE_IPV4:
472 		switch (ptype->l4_type) {
473 		case GVE_L4_TYPE_TCP:
474 			*is_tcp = true;
475 			M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV4);
476 			break;
477 		case GVE_L4_TYPE_UDP:
478 			M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV4);
479 			break;
480 		default:
481 			M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV4);
482 		}
483 		break;
484 	case GVE_L3_TYPE_IPV6:
485 		switch (ptype->l4_type) {
486 		case GVE_L4_TYPE_TCP:
487 			*is_tcp = true;
488 			M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV6);
489 			break;
490 		case GVE_L4_TYPE_UDP:
491 			M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV6);
492 			break;
493 		default:
494 			M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV6);
495 		}
496 		break;
497 	default:
498 		M_HASHTYPE_SET(mbuf, M_HASHTYPE_OPAQUE_HASH);
499 	}
500 }
501 
502 static void
503 gve_rx_set_csum_flags_dqo(struct mbuf *mbuf,
504     struct gve_rx_compl_desc_dqo *desc,
505     struct gve_ptype *ptype)
506 {
507 	/* HW did not identify and process L3 and L4 headers. */
508 	if (__predict_false(!desc->l3_l4_processed))
509 		return;
510 
511 	if (ptype->l3_type == GVE_L3_TYPE_IPV4) {
512 		if (__predict_false(desc->csum_ip_err ||
513 		    desc->csum_external_ip_err))
514 			return;
515 	} else if (ptype->l3_type == GVE_L3_TYPE_IPV6) {
516 		/* Checksum should be skipped if this flag is set. */
517 		if (__predict_false(desc->ipv6_ex_add))
518 			return;
519 	}
520 
521 	if (__predict_false(desc->csum_l4_err))
522 		return;
523 
524 	switch (ptype->l4_type) {
525 	case GVE_L4_TYPE_TCP:
526 	case GVE_L4_TYPE_UDP:
527 	case GVE_L4_TYPE_ICMP:
528 	case GVE_L4_TYPE_SCTP:
529 		mbuf->m_pkthdr.csum_flags = CSUM_IP_CHECKED |
530 					    CSUM_IP_VALID |
531 					    CSUM_DATA_VALID |
532 					    CSUM_PSEUDO_HDR;
533 		mbuf->m_pkthdr.csum_data = 0xffff;
534 		break;
535 	default:
536 		break;
537 	}
538 }
539 
540 static void
541 gve_rx_input_mbuf_dqo(struct gve_rx_ring *rx,
542     struct gve_rx_compl_desc_dqo *compl_desc)
543 {
544 	struct mbuf *mbuf = rx->ctx.mbuf_head;
545 	if_t ifp = rx->com.priv->ifp;
546 	struct gve_ptype *ptype;
547 	bool do_if_input = true;
548 	bool is_tcp = false;
549 
550 	ptype = &rx->com.priv->ptype_lut_dqo->ptypes[compl_desc->packet_type];
551 	gve_rx_set_hashtype_dqo(mbuf, ptype, &is_tcp);
552 	mbuf->m_pkthdr.flowid = le32toh(compl_desc->hash);
553 	gve_rx_set_csum_flags_dqo(mbuf, compl_desc, ptype);
554 
555 	mbuf->m_pkthdr.rcvif = ifp;
556 	mbuf->m_pkthdr.len = rx->ctx.total_size;
557 
558 	if (((if_getcapenable(rx->com.priv->ifp) & IFCAP_LRO) != 0) &&
559 	    is_tcp &&
560 	    (rx->lro.lro_cnt != 0) &&
561 	    (tcp_lro_rx(&rx->lro, mbuf, 0) == 0))
562 		do_if_input = false;
563 
564 	if (do_if_input)
565 		if_input(ifp, mbuf);
566 
567 	counter_enter();
568 	counter_u64_add_protected(rx->stats.rbytes, rx->ctx.total_size);
569 	counter_u64_add_protected(rx->stats.rpackets, 1);
570 	counter_exit();
571 
572 	rx->ctx = (struct gve_rx_ctx){};
573 }
574 
575 static int
576 gve_rx_copybreak_dqo(struct gve_rx_ring *rx, void *va,
577     struct gve_rx_compl_desc_dqo *compl_desc, uint16_t frag_len)
578 {
579 	struct mbuf *mbuf;
580 
581 	mbuf = m_get2(frag_len, M_NOWAIT, MT_DATA, M_PKTHDR);
582 	if (__predict_false(mbuf == NULL))
583 		return (ENOMEM);
584 
585 	counter_enter();
586 	counter_u64_add_protected(rx->stats.rx_copybreak_cnt, 1);
587 	counter_exit();
588 
589 	m_copyback(mbuf, 0, frag_len, va);
590 	mbuf->m_len = frag_len;
591 
592 	rx->ctx.mbuf_head = mbuf;
593 	rx->ctx.mbuf_tail = mbuf;
594 	rx->ctx.total_size += frag_len;
595 
596 	gve_rx_input_mbuf_dqo(rx, compl_desc);
597 	return (0);
598 }
599 
600 static void
601 gve_rx_dqo(struct gve_priv *priv, struct gve_rx_ring *rx,
602     struct gve_rx_compl_desc_dqo *compl_desc,
603     int *work_done)
604 {
605 	bool is_last_frag = compl_desc->end_of_packet != 0;
606 	struct gve_rx_ctx *ctx = &rx->ctx;
607 	struct gve_rx_buf_dqo *buf;
608 	uint32_t num_pending_bufs;
609 	uint16_t frag_len;
610 	uint16_t buf_id;
611 	int err;
612 
613 	buf_id = le16toh(compl_desc->buf_id);
614 	if (__predict_false(buf_id >= rx->dqo.buf_cnt)) {
615 		device_printf(priv->dev, "Invalid rx buf id %d on rxq %d, issuing reset\n",
616 		    buf_id, rx->com.id);
617 		gve_schedule_reset(priv);
618 		goto drop_frag_clear_ctx;
619 	}
620 	buf = &rx->dqo.bufs[buf_id];
621 	if (__predict_false(buf->mbuf == NULL)) {
622 		device_printf(priv->dev, "Spurious completion for buf id %d on rxq %d, issuing reset\n",
623 		    buf_id, rx->com.id);
624 		gve_schedule_reset(priv);
625 		goto drop_frag_clear_ctx;
626 	}
627 
628 	if (__predict_false(ctx->drop_pkt))
629 		goto drop_frag;
630 
631 	if (__predict_false(compl_desc->rx_error)) {
632 		counter_enter();
633 		counter_u64_add_protected(rx->stats.rx_dropped_pkt_desc_err, 1);
634 		counter_exit();
635 		goto drop_frag;
636 	}
637 
638 	bus_dmamap_sync(rx->dqo.buf_dmatag, buf->dmamap,
639 	    BUS_DMASYNC_POSTREAD);
640 
641 	frag_len = compl_desc->packet_len;
642 	if (frag_len <= priv->rx_copybreak && !ctx->mbuf_head && is_last_frag) {
643 		err = gve_rx_copybreak_dqo(rx, mtod(buf->mbuf, char*),
644 		    compl_desc, frag_len);
645 		if (__predict_false(err != 0))
646 			goto drop_frag;
647 		(*work_done)++;
648 		gve_rx_post_buf_dqo(rx, buf);
649 		return;
650 	}
651 
652 	/*
653 	 * Although buffer completions may arrive out of order, buffer
654 	 * descriptors are consumed by the NIC in order. That is, the
655 	 * buffer at desc_ring[tail] might not be the buffer we got the
656 	 * completion compl_ring[tail] for: but we know that desc_ring[tail]
657 	 * has already been read by the NIC.
658 	 */
659 	num_pending_bufs = (rx->dqo.head - rx->dqo.tail) & rx->dqo.mask;
660 
661 	/*
662 	 * For every fragment received, try to post a new buffer.
663 	 *
664 	 * Failures are okay but only so long as the number of outstanding
665 	 * buffers is above a threshold.
666 	 *
667 	 * Beyond that we drop new packets to reuse their buffers.
668 	 * Without ensuring a minimum number of buffers for the NIC to
669 	 * put packets in, we run the risk of getting the queue stuck
670 	 * for good.
671 	 */
672 	err = gve_rx_post_new_mbuf_dqo(rx, M_NOWAIT);
673 	if (__predict_false(err != 0 &&
674 	    num_pending_bufs <= GVE_RX_DQO_MIN_PENDING_BUFS)) {
675 		counter_enter();
676 		counter_u64_add_protected(
677 		    rx->stats.rx_dropped_pkt_mbuf_alloc_fail, 1);
678 		counter_exit();
679 		goto drop_frag;
680 	}
681 
682 	buf->mbuf->m_len = frag_len;
683 	ctx->total_size += frag_len;
684 	if (ctx->mbuf_tail == NULL) {
685 		ctx->mbuf_head = buf->mbuf;
686 		ctx->mbuf_tail = buf->mbuf;
687 	} else {
688 		buf->mbuf->m_flags &= ~M_PKTHDR;
689 		ctx->mbuf_tail->m_next = buf->mbuf;
690 		ctx->mbuf_tail = buf->mbuf;
691 	}
692 
693 	/*
694 	 * Disassociate the mbuf from buf and surrender buf to the free list to
695 	 * be used by a future mbuf.
696 	 */
697 	bus_dmamap_unload(rx->dqo.buf_dmatag, buf->dmamap);
698 	buf->mbuf = NULL;
699 	buf->addr = 0;
700 	SLIST_INSERT_HEAD(&rx->dqo.free_bufs, buf, slist_entry);
701 
702 	if (is_last_frag) {
703 		gve_rx_input_mbuf_dqo(rx, compl_desc);
704 		(*work_done)++;
705 	}
706 	return;
707 
708 drop_frag:
709 	/* Clear the earlier frags if there were any */
710 	m_freem(ctx->mbuf_head);
711 	rx->ctx = (struct gve_rx_ctx){};
712 	/* Drop the rest of the pkt if there are more frags */
713 	ctx->drop_pkt = true;
714 	/* Reuse the dropped frag's buffer */
715 	gve_rx_post_buf_dqo(rx, buf);
716 
717 	if (is_last_frag)
718 		goto drop_frag_clear_ctx;
719 	return;
720 
721 drop_frag_clear_ctx:
722 	counter_enter();
723 	counter_u64_add_protected(rx->stats.rx_dropped_pkt, 1);
724 	counter_exit();
725 	m_freem(ctx->mbuf_head);
726 	rx->ctx = (struct gve_rx_ctx){};
727 }
728 
729 static void *
730 gve_get_cpu_addr_for_qpl_buf(struct gve_rx_ring *rx,
731     struct gve_rx_buf_dqo *buf, uint8_t buf_frag_num)
732 {
733 	int page_idx = buf - rx->dqo.bufs;
734 	void *va = rx->com.qpl->dmas[page_idx].cpu_addr;
735 
736 	va = (char *)va + (buf_frag_num * GVE_DEFAULT_RX_BUFFER_SIZE);
737 	return (va);
738 }
739 
740 static int
741 gve_rx_add_clmbuf_to_ctx(struct gve_rx_ring *rx,
742     struct gve_rx_ctx *ctx, struct gve_rx_buf_dqo *buf,
743     uint8_t buf_frag_num, uint16_t frag_len)
744 {
745 	void *va = gve_get_cpu_addr_for_qpl_buf(rx, buf, buf_frag_num);
746 	struct mbuf *mbuf;
747 
748 	if (ctx->mbuf_tail == NULL) {
749 		mbuf = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
750 		if (mbuf == NULL)
751 			return (ENOMEM);
752 		ctx->mbuf_head = mbuf;
753 		ctx->mbuf_tail = mbuf;
754 	} else {
755 		mbuf = m_getcl(M_NOWAIT, MT_DATA, 0);
756 		if (mbuf == NULL)
757 			return (ENOMEM);
758 		ctx->mbuf_tail->m_next = mbuf;
759 		ctx->mbuf_tail = mbuf;
760 	}
761 
762 	mbuf->m_len = frag_len;
763 	ctx->total_size += frag_len;
764 
765 	m_copyback(mbuf, 0, frag_len, va);
766 	counter_enter();
767 	counter_u64_add_protected(rx->stats.rx_frag_copy_cnt, 1);
768 	counter_exit();
769 	return (0);
770 }
771 
772 static int
773 gve_rx_add_extmbuf_to_ctx(struct gve_rx_ring *rx,
774     struct gve_rx_ctx *ctx, struct gve_rx_buf_dqo *buf,
775     uint8_t buf_frag_num, uint16_t frag_len)
776 {
777 	struct mbuf *mbuf;
778 	void *page_addr;
779 	vm_page_t page;
780 	int page_idx;
781 	void *va;
782 
783 	if (ctx->mbuf_tail == NULL) {
784 		mbuf = m_gethdr(M_NOWAIT, MT_DATA);
785 		if (mbuf == NULL)
786 			return (ENOMEM);
787 		ctx->mbuf_head = mbuf;
788 		ctx->mbuf_tail = mbuf;
789 	} else {
790 		mbuf = m_get(M_NOWAIT, MT_DATA);
791 		if (mbuf == NULL)
792 			return (ENOMEM);
793 		ctx->mbuf_tail->m_next = mbuf;
794 		ctx->mbuf_tail = mbuf;
795 	}
796 
797 	mbuf->m_len = frag_len;
798 	ctx->total_size += frag_len;
799 
800 	page_idx = buf - rx->dqo.bufs;
801 	page = rx->com.qpl->pages[page_idx];
802 	page_addr = rx->com.qpl->dmas[page_idx].cpu_addr;
803 	va = (char *)page_addr + (buf_frag_num * GVE_DEFAULT_RX_BUFFER_SIZE);
804 
805 	/*
806 	 * Grab an extra ref to the page so that gve_mextadd_free
807 	 * does not end up freeing the page while the interface exists.
808 	 */
809 	vm_page_wire(page);
810 
811 	counter_enter();
812 	counter_u64_add_protected(rx->stats.rx_frag_flip_cnt, 1);
813 	counter_exit();
814 
815 	MEXTADD(mbuf, va, frag_len,
816 	    gve_mextadd_free, page, page_addr,
817 	    0, EXT_NET_DRV);
818 	return (0);
819 }
820 
821 static void
822 gve_rx_dqo_qpl(struct gve_priv *priv, struct gve_rx_ring *rx,
823     struct gve_rx_compl_desc_dqo *compl_desc,
824     int *work_done)
825 {
826 	bool is_last_frag = compl_desc->end_of_packet != 0;
827 	union gve_rx_qpl_buf_id_dqo composed_id;
828 	struct gve_dma_handle *page_dma_handle;
829 	struct gve_rx_ctx *ctx = &rx->ctx;
830 	struct gve_rx_buf_dqo *buf;
831 	uint32_t num_pending_bufs;
832 	uint8_t buf_frag_num;
833 	uint16_t frag_len;
834 	uint16_t buf_id;
835 	int err;
836 
837 	composed_id.all = le16toh(compl_desc->buf_id);
838 	buf_id = composed_id.buf_id;
839 	buf_frag_num = composed_id.frag_num;
840 
841 	if (__predict_false(buf_id >= rx->dqo.buf_cnt)) {
842 		device_printf(priv->dev, "Invalid rx buf id %d on rxq %d, issuing reset\n",
843 		    buf_id, rx->com.id);
844 		gve_schedule_reset(priv);
845 		goto drop_frag_clear_ctx;
846 	}
847 	buf = &rx->dqo.bufs[buf_id];
848 	if (__predict_false(buf->num_nic_frags == 0 ||
849 	    buf_frag_num > GVE_DQ_NUM_FRAGS_IN_PAGE - 1)) {
850 		device_printf(priv->dev, "Spurious compl for buf id %d on rxq %d "
851 		    "with buf_frag_num %d and num_nic_frags %d, issuing reset\n",
852 		    buf_id, rx->com.id, buf_frag_num, buf->num_nic_frags);
853 		gve_schedule_reset(priv);
854 		goto drop_frag_clear_ctx;
855 	}
856 
857 	buf->num_nic_frags--;
858 
859 	if (__predict_false(ctx->drop_pkt))
860 		goto drop_frag;
861 
862 	if (__predict_false(compl_desc->rx_error)) {
863 		counter_enter();
864 		counter_u64_add_protected(rx->stats.rx_dropped_pkt_desc_err, 1);
865 		counter_exit();
866 		goto drop_frag;
867 	}
868 
869 	page_dma_handle = gve_get_page_dma_handle(rx, buf);
870 	bus_dmamap_sync(page_dma_handle->tag, page_dma_handle->map,
871 	    BUS_DMASYNC_POSTREAD);
872 
873 	frag_len = compl_desc->packet_len;
874 	if (frag_len <= priv->rx_copybreak && !ctx->mbuf_head && is_last_frag) {
875 		void *va = gve_get_cpu_addr_for_qpl_buf(rx, buf, buf_frag_num);
876 
877 		err = gve_rx_copybreak_dqo(rx, va, compl_desc, frag_len);
878 		if (__predict_false(err != 0))
879 			goto drop_frag;
880 		(*work_done)++;
881 		gve_rx_post_qpl_buf_dqo(rx, buf, buf_frag_num);
882 		return;
883 	}
884 
885 	num_pending_bufs = (rx->dqo.head - rx->dqo.tail) & rx->dqo.mask;
886 	err = gve_rx_post_new_dqo_qpl_buf(rx);
887 	if (__predict_false(err != 0 &&
888 	    num_pending_bufs <= GVE_RX_DQO_MIN_PENDING_BUFS)) {
889 		/*
890 		 * Resort to copying this fragment into a cluster mbuf
891 		 * when the above threshold is breached and repost the
892 		 * incoming buffer. If we cannot find cluster mbufs,
893 		 * just drop the packet (to repost its buffer).
894 		 */
895 		err = gve_rx_add_clmbuf_to_ctx(rx, ctx, buf,
896 		    buf_frag_num, frag_len);
897 		if (err != 0) {
898 			counter_enter();
899 			counter_u64_add_protected(
900 			    rx->stats.rx_dropped_pkt_buf_post_fail, 1);
901 			counter_exit();
902 			goto drop_frag;
903 		}
904 		gve_rx_post_qpl_buf_dqo(rx, buf, buf_frag_num);
905 	} else {
906 		err = gve_rx_add_extmbuf_to_ctx(rx, ctx, buf,
907 		    buf_frag_num, frag_len);
908 		if (__predict_false(err != 0)) {
909 			counter_enter();
910 			counter_u64_add_protected(
911 			    rx->stats.rx_dropped_pkt_mbuf_alloc_fail, 1);
912 			counter_exit();
913 			goto drop_frag;
914 		}
915 	}
916 
917 	/*
918 	 * Both the counts need to be checked.
919 	 *
920 	 * num_nic_frags == 0 implies no pending completions
921 	 * but not all frags may have yet been posted.
922 	 *
923 	 * next_idx == 0 implies all frags have been posted
924 	 * but there might be pending completions.
925 	 */
926 	if (buf->num_nic_frags == 0 && buf->next_idx == 0)
927 		STAILQ_INSERT_TAIL(&rx->dqo.used_bufs, buf, stailq_entry);
928 
929 	if (is_last_frag) {
930 		gve_rx_input_mbuf_dqo(rx, compl_desc);
931 		(*work_done)++;
932 	}
933 	return;
934 
935 drop_frag:
936 	/* Clear the earlier frags if there were any */
937 	m_freem(ctx->mbuf_head);
938 	rx->ctx = (struct gve_rx_ctx){};
939 	/* Drop the rest of the pkt if there are more frags */
940 	ctx->drop_pkt = true;
941 	/* Reuse the dropped frag's buffer */
942 	gve_rx_post_qpl_buf_dqo(rx, buf, buf_frag_num);
943 
944 	if (is_last_frag)
945 		goto drop_frag_clear_ctx;
946 	return;
947 
948 drop_frag_clear_ctx:
949 	counter_enter();
950 	counter_u64_add_protected(rx->stats.rx_dropped_pkt, 1);
951 	counter_exit();
952 	m_freem(ctx->mbuf_head);
953 	rx->ctx = (struct gve_rx_ctx){};
954 }
955 
956 static bool
957 gve_rx_cleanup_dqo(struct gve_priv *priv, struct gve_rx_ring *rx, int budget)
958 {
959 	struct gve_rx_compl_desc_dqo *compl_desc;
960 	uint32_t work_done = 0;
961 
962 	NET_EPOCH_ASSERT();
963 
964 	while (work_done < budget) {
965 		bus_dmamap_sync(rx->dqo.compl_ring_mem.tag, rx->dqo.compl_ring_mem.map,
966 		    BUS_DMASYNC_POSTREAD);
967 
968 		compl_desc = &rx->dqo.compl_ring[rx->dqo.tail];
969 		if (compl_desc->generation == rx->dqo.cur_gen_bit)
970 			break;
971 		/*
972 		 * Prevent generation bit from being read after the rest of the
973 		 * descriptor.
974 		 */
975 		rmb();
976 
977 		rx->cnt++;
978 		rx->dqo.tail = (rx->dqo.tail + 1) & rx->dqo.mask;
979 		rx->dqo.cur_gen_bit ^= (rx->dqo.tail == 0);
980 
981 		if (gve_is_qpl(priv))
982 			gve_rx_dqo_qpl(priv, rx, compl_desc, &work_done);
983 		else
984 			gve_rx_dqo(priv, rx, compl_desc, &work_done);
985 	}
986 
987 	if (work_done != 0)
988 		tcp_lro_flush_all(&rx->lro);
989 
990 	gve_rx_post_buffers_dqo(rx, M_NOWAIT);
991 	if (gve_is_qpl(priv))
992 		gve_rx_maybe_extract_from_used_bufs(rx, /*just_one=*/false);
993 	return (work_done == budget);
994 }
995 
996 void
997 gve_rx_cleanup_tq_dqo(void *arg, int pending)
998 {
999 	struct gve_rx_ring *rx = arg;
1000 	struct gve_priv *priv = rx->com.priv;
1001 
1002 	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
1003 		return;
1004 
1005 	if (gve_rx_cleanup_dqo(priv, rx, /*budget=*/64)) {
1006 		taskqueue_enqueue(rx->com.cleanup_tq, &rx->com.cleanup_task);
1007 		return;
1008 	}
1009 
1010 	gve_db_bar_dqo_write_4(priv, rx->com.irq_db_offset,
1011 	    GVE_ITR_NO_UPDATE_DQO | GVE_ITR_ENABLE_BIT_DQO);
1012 }
1013