xref: /freebsd/sys/dev/gve/gve_tx.c (revision 031800c786823a9ad4c4d2f79f217d42dad3f5d1)
/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 2023-2024 Google LLC
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 *    may be used to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include "gve.h"
#include "gve_adminq.h"
#include "gve_dqo.h"

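/*
 * Spec-stipulated minimum size of the first (packet-descriptor) segment for
 * packets that are neither TCP nor UDP; see the comment in gve_xmit().
 */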
#define GVE_GQ_TX_MIN_PKT_DESC_BYTES 182

static int
gve_tx_fifo_init(struct gve_priv *priv, struct gve_tx_ring *tx)
{
	struct gve_queue_page_list *qpl = tx->com.qpl;
	struct gve_tx_fifo *fifo = &tx->fifo;

	fifo->size = qpl->num_pages * PAGE_SIZE;
	fifo->base = qpl->kva;
	atomic_store_int(&fifo->available, fifo->size);
	fifo->head = 0;

	return (0);
}

static void
gve_tx_free_ring_gqi(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];

	if (tx->desc_ring != NULL) {
		gve_dma_free_coherent(&tx->desc_ring_mem);
		tx->desc_ring = NULL;
	}

	if (tx->info != NULL) {
		free(tx->info, M_GVE);
		tx->info = NULL;
	}
}

static void
gve_tx_free_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;

	/* Safe to call even if never alloced */
	gve_free_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS);

	if (mtx_initialized(&tx->ring_mtx))
		mtx_destroy(&tx->ring_mtx);

	if (com->q_resources != NULL) {
		gve_dma_free_coherent(&com->q_resources_mem);
		com->q_resources = NULL;
	}

	if (tx->br != NULL) {
		buf_ring_free(tx->br, M_DEVBUF);
		tx->br = NULL;
	}

	if (gve_is_gqi(priv))
		gve_tx_free_ring_gqi(priv, i);
	else
		gve_tx_free_ring_dqo(priv, i);
}

static int
gve_tx_alloc_ring_gqi(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;
	int err;

	err = gve_dma_alloc_coherent(priv,
	    sizeof(union gve_tx_desc) * priv->tx_desc_cnt,
	    CACHE_LINE_SIZE, &tx->desc_ring_mem);
	if (err != 0) {
		device_printf(priv->dev,
		    "Failed to alloc desc ring for tx ring %d", i);
		goto abort;
	}
	tx->desc_ring = tx->desc_ring_mem.cpu_addr;

	com->qpl = &priv->qpls[i];
	if (com->qpl == NULL) {
		device_printf(priv->dev, "No QPL left for tx ring %d\n", i);
		err = ENOMEM;
		goto abort;
	}

	err = gve_tx_fifo_init(priv, tx);
	if (err != 0)
		goto abort;

	tx->info = malloc(
	    sizeof(struct gve_tx_buffer_state) * priv->tx_desc_cnt,
	    M_GVE, M_WAITOK | M_ZERO);
	return (0);

abort:
	gve_tx_free_ring_gqi(priv, i);
	return (err);
}

static int
gve_tx_alloc_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;
	char mtx_name[16];
	int err;

	com->priv = priv;
	com->id = i;

	if (gve_is_gqi(priv))
		err = gve_tx_alloc_ring_gqi(priv, i);
	else
		err = gve_tx_alloc_ring_dqo(priv, i);
	if (err != 0)
		goto abort;

	sprintf(mtx_name, "gvetx%d", i);
	mtx_init(&tx->ring_mtx, mtx_name, NULL, MTX_DEF);

	tx->br = buf_ring_alloc(GVE_TX_BUFRING_ENTRIES, M_DEVBUF,
	    M_WAITOK, &tx->ring_mtx);

	gve_alloc_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS);

	err = gve_dma_alloc_coherent(priv, sizeof(struct gve_queue_resources),
	    PAGE_SIZE, &com->q_resources_mem);
	if (err != 0) {
		device_printf(priv->dev,
		    "Failed to alloc queue resources for tx ring %d", i);
		goto abort;
	}
	com->q_resources = com->q_resources_mem.cpu_addr;

	return (0);

abort:
	gve_tx_free_ring(priv, i);
	return (err);
}

int
gve_alloc_tx_rings(struct gve_priv *priv)
{
	int err = 0;
	int i;

	priv->tx = malloc(sizeof(struct gve_tx_ring) * priv->tx_cfg.num_queues,
	    M_GVE, M_WAITOK | M_ZERO);

	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
		err = gve_tx_alloc_ring(priv, i);
		if (err != 0)
			goto free_rings;

	}

	return (0);

free_rings:
	while (i--)
		gve_tx_free_ring(priv, i);
	free(priv->tx, M_GVE);
	return (err);
}

void
gve_free_tx_rings(struct gve_priv *priv)
{
	int i;

	for (i = 0; i < priv->tx_cfg.num_queues; i++)
		gve_tx_free_ring(priv, i);

	free(priv->tx, M_GVE);
}

static void
gve_tx_clear_desc_ring(struct gve_tx_ring *tx)
{
	struct gve_ring_com *com = &tx->com;
	int i;

	for (i = 0; i < com->priv->tx_desc_cnt; i++) {
		tx->desc_ring[i] = (union gve_tx_desc){};
		tx->info[i] = (struct gve_tx_buffer_state){};
	}

	bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map,
	    BUS_DMASYNC_PREWRITE);
}

static void
gve_clear_tx_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_tx_fifo *fifo = &tx->fifo;

	tx->req = 0;
	tx->done = 0;
	tx->mask = priv->tx_desc_cnt - 1;

	atomic_store_int(&fifo->available, fifo->size);
	fifo->head = 0;

	gve_tx_clear_desc_ring(tx);
}

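/*
 * Each tx ring is serviced by two taskqueues: a cleanup taskqueue driven by
 * the ring's interrupt (gve_tx_intr -> gve_tx_cleanup_tq) and an xmit
 * taskqueue that drains the ring's buf_ring (gve_xmit_tq).
 */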
static void
gve_start_tx_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;

	atomic_store_bool(&tx->stopped, false);
	if (gve_is_gqi(priv))
		NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq, tx);
	else
		NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq_dqo, tx);
	com->cleanup_tq = taskqueue_create_fast("gve tx", M_WAITOK,
	    taskqueue_thread_enqueue, &com->cleanup_tq);
	taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET, "%s txq %d",
	    device_get_nameunit(priv->dev), i);

	TASK_INIT(&tx->xmit_task, 0, gve_xmit_tq, tx);
	tx->xmit_tq = taskqueue_create_fast("gve tx xmit",
	    M_WAITOK, taskqueue_thread_enqueue, &tx->xmit_tq);
	taskqueue_start_threads(&tx->xmit_tq, 1, PI_NET, "%s txq %d xmit",
	    device_get_nameunit(priv->dev), i);
}

int
gve_create_tx_rings(struct gve_priv *priv)
{
	struct gve_ring_com *com;
	struct gve_tx_ring *tx;
	int err;
	int i;

	if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK))
		return (0);

	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
		if (gve_is_gqi(priv))
			gve_clear_tx_ring(priv, i);
		else
			gve_clear_tx_ring_dqo(priv, i);
	}

	err = gve_adminq_create_tx_queues(priv, priv->tx_cfg.num_queues);
	if (err != 0)
		return (err);

	bus_dmamap_sync(priv->irqs_db_mem.tag, priv->irqs_db_mem.map,
	    BUS_DMASYNC_POSTREAD);

	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
		tx = &priv->tx[i];
		com = &tx->com;

		com->irq_db_offset = 4 * be32toh(priv->irq_db_indices[com->ntfy_id].index);

		bus_dmamap_sync(com->q_resources_mem.tag, com->q_resources_mem.map,
		    BUS_DMASYNC_POSTREAD);
		com->db_offset = 4 * be32toh(com->q_resources->db_index);
		com->counter_idx = be32toh(com->q_resources->counter_index);

		gve_start_tx_ring(priv, i);
	}

	gve_set_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK);
	return (0);
}

static void
gve_stop_tx_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;

	if (com->cleanup_tq != NULL) {
		taskqueue_quiesce(com->cleanup_tq);
		taskqueue_free(com->cleanup_tq);
		com->cleanup_tq = NULL;
	}

	if (tx->xmit_tq != NULL) {
		taskqueue_quiesce(tx->xmit_tq);
		taskqueue_free(tx->xmit_tq);
		tx->xmit_tq = NULL;
	}
}

int
gve_destroy_tx_rings(struct gve_priv *priv)
{
	int err;
	int i;

	for (i = 0; i < priv->tx_cfg.num_queues; i++)
		gve_stop_tx_ring(priv, i);

	if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK)) {
		err = gve_adminq_destroy_tx_queues(priv, priv->tx_cfg.num_queues);
		if (err != 0)
			return (err);
		gve_clear_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK);
	}

	return (0);
}

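/*
 * Interrupt filter: masks further interrupts for this ring via the doorbell
 * (GVE_IRQ_MASK) and defers completion processing to the cleanup taskqueue,
 * which later re-arms the interrupt with GVE_IRQ_ACK | GVE_IRQ_EVENT.
 */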
int
gve_tx_intr(void *arg)
{
	struct gve_tx_ring *tx = arg;
	struct gve_priv *priv = tx->com.priv;
	struct gve_ring_com *com = &tx->com;

	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
		return (FILTER_STRAY);

	gve_db_bar_write_4(priv, com->irq_db_offset, GVE_IRQ_MASK);
	taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task);
	return (FILTER_HANDLED);
}

static uint32_t
gve_tx_load_event_counter(struct gve_priv *priv, struct gve_tx_ring *tx)
{
	bus_dmamap_sync(priv->counter_array_mem.tag, priv->counter_array_mem.map,
	    BUS_DMASYNC_POSTREAD);
	uint32_t counter = priv->counters[tx->com.counter_idx];
	return (be32toh(counter));
}

static void
gve_tx_free_fifo(struct gve_tx_fifo *fifo, size_t bytes)
{
	atomic_add_int(&fifo->available, bytes);
}

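/*
 * The device-written event counter tracks how many descriptors the NIC has
 * consumed; the difference from tx->done (unsigned arithmetic, so counter
 * wraparound is harmless) is the number of descriptor slots to reap. Only the
 * slot that held the packet descriptor has an mbuf attached; the rest are
 * skipped. space_freed accumulates the FIFO bytes (lengths plus alignment
 * padding) released by the completed packets.
 */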
void
gve_tx_cleanup_tq(void *arg, int pending)
{
	struct gve_tx_ring *tx = arg;
	struct gve_priv *priv = tx->com.priv;
	uint32_t nic_done = gve_tx_load_event_counter(priv, tx);
	uint32_t todo = nic_done - tx->done;
	size_t space_freed = 0;
	int i, j;

	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
		return;

	for (j = 0; j < todo; j++) {
		uint32_t idx = tx->done & tx->mask;
		struct gve_tx_buffer_state *info = &tx->info[idx];
		struct mbuf *mbuf = info->mbuf;

		tx->done++;
		if (mbuf == NULL)
			continue;

		info->mbuf = NULL;
		counter_enter();
		counter_u64_add_protected(tx->stats.tbytes, mbuf->m_pkthdr.len);
		counter_u64_add_protected(tx->stats.tpackets, 1);
		counter_exit();
		m_freem(mbuf);

		for (i = 0; i < GVE_TX_MAX_DESCS; i++) {
			space_freed += info->iov[i].iov_len + info->iov[i].iov_padding;
			info->iov[i].iov_len = 0;
			info->iov[i].iov_padding = 0;
		}
	}

	gve_tx_free_fifo(&tx->fifo, space_freed);

	gve_db_bar_write_4(priv, tx->com.irq_db_offset,
	    GVE_IRQ_ACK | GVE_IRQ_EVENT);

	/*
	 * Completions born before this barrier MAY NOT cause the NIC to send an
	 * interrupt but they will still be handled by the enqueue below.
	 * Completions born after the barrier WILL trigger an interrupt.
	 */
	atomic_thread_fence_seq_cst();

	nic_done = gve_tx_load_event_counter(priv, tx);
	todo = nic_done - tx->done;
	if (todo != 0) {
		gve_db_bar_write_4(priv, tx->com.irq_db_offset, GVE_IRQ_MASK);
		taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task);
	}

	if (atomic_load_bool(&tx->stopped) && space_freed) {
		atomic_store_bool(&tx->stopped, false);
		taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
	}
}

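/*
 * Sync every QPL page touched by the copied fragment. For example, with 4 KiB
 * pages, iov_offset 4000 and iov_len 200 span pages 0 and 1, so both get synced.
 */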
static void
gve_dma_sync_for_device(struct gve_queue_page_list *qpl,
			uint64_t iov_offset, uint64_t iov_len)
{
	uint64_t last_page = (iov_offset + iov_len - 1) / PAGE_SIZE;
	uint64_t first_page = iov_offset / PAGE_SIZE;
	struct gve_dma_handle *dma;
	uint64_t page;

	for (page = first_page; page <= last_page; page++) {
		dma = &(qpl->dmas[page]);
		bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREWRITE);
	}
}

static void
gve_tx_fill_mtd_desc(struct gve_tx_mtd_desc *mtd_desc, struct mbuf *mbuf)
{
	mtd_desc->type_flags = GVE_TXD_MTD | GVE_MTD_SUBTYPE_PATH;
	mtd_desc->path_state = GVE_MTD_PATH_STATE_DEFAULT | GVE_MTD_PATH_HASH_L4;
	mtd_desc->path_hash = htobe32(mbuf->m_pkthdr.flowid);
	mtd_desc->reserved0 = 0;
	mtd_desc->reserved1 = 0;
}

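/*
 * Note: the checksum, L4-header, and L3 offsets written into the descriptors
 * below are shifted right by one, i.e. they appear to be expressed in 2-byte
 * units rather than bytes.
 */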
static void
gve_tx_fill_pkt_desc(struct gve_tx_pkt_desc *pkt_desc, bool is_tso,
    uint16_t l4_hdr_offset, uint32_t desc_cnt,
    uint16_t first_seg_len, uint64_t addr, bool has_csum_flag,
    int csum_offset, uint16_t pkt_len)
{
	if (is_tso) {
		pkt_desc->type_flags = GVE_TXD_TSO | GVE_TXF_L4CSUM;
		pkt_desc->l4_csum_offset = csum_offset >> 1;
		pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1;
	} else if (has_csum_flag) {
		pkt_desc->type_flags = GVE_TXD_STD | GVE_TXF_L4CSUM;
		pkt_desc->l4_csum_offset = csum_offset >> 1;
		pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1;
	} else {
		pkt_desc->type_flags = GVE_TXD_STD;
		pkt_desc->l4_csum_offset = 0;
		pkt_desc->l4_hdr_offset = 0;
	}
	pkt_desc->desc_cnt = desc_cnt;
	pkt_desc->len = htobe16(pkt_len);
	pkt_desc->seg_len = htobe16(first_seg_len);
	pkt_desc->seg_addr = htobe64(addr);
}

static void
gve_tx_fill_seg_desc(struct gve_tx_seg_desc *seg_desc,
    bool is_tso, uint16_t len, uint64_t addr,
    bool is_ipv6, uint8_t l3_off, uint16_t tso_mss)
{
	seg_desc->type_flags = GVE_TXD_SEG;
	if (is_tso) {
		if (is_ipv6)
			seg_desc->type_flags |= GVE_TXSF_IPV6;
		seg_desc->l3_offset = l3_off >> 1;
		seg_desc->mss = htobe16(tso_mss);
	}
	seg_desc->seg_len = htobe16(len);
	seg_desc->seg_addr = htobe64(addr);
}

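/*
 * Number of free descriptor slots. tx->req and tx->done are free-running
 * counters, so with a 256-entry ring (mask 255), req = 300 and done = 290
 * leave 256 - 10 = 246 slots available.
 */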
static inline uint32_t
gve_tx_avail(struct gve_tx_ring *tx)
{
	return (tx->mask + 1 - (tx->req - tx->done));
}

static bool
gve_tx_fifo_can_alloc(struct gve_tx_fifo *fifo, size_t bytes)
{
	return (atomic_load_int(&fifo->available) >= bytes);
}

static inline bool
gve_can_tx(struct gve_tx_ring *tx, int bytes_required)
{
	return (gve_tx_avail(tx) >= (GVE_TX_MAX_DESCS + 1) &&
	    gve_tx_fifo_can_alloc(&tx->fifo, bytes_required));
}

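/*
 * Returns 0 if a fragment of the given size fits before the end of the FIFO;
 * otherwise returns the number of bytes left at the tail, which the caller
 * allocates as padding so that the fragment lands contiguously at offset 0.
 */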
static int
gve_tx_fifo_pad_alloc_one_frag(struct gve_tx_fifo *fifo, size_t bytes)
{
	return (fifo->head + bytes < fifo->size) ? 0 : fifo->size - fifo->head;
}

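/*
 * Worst-case FIFO bytes needed for the packet: any padding required to keep
 * the first segment contiguous at the end of the FIFO, plus the padding that
 * re-aligns the payload to a cacheline boundary, plus the packet itself.
 * E.g. with a 64-byte cacheline and first_seg_len = 100, align_hdr_pad is
 * roundup2(100, 64) - 100 = 28.
 */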
static inline int
gve_fifo_bytes_required(struct gve_tx_ring *tx, uint16_t first_seg_len,
    uint16_t pkt_len)
{
	int pad_bytes, align_hdr_pad;
	int bytes;

	pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len);
	/* We need to take into account the header alignment padding. */
	align_hdr_pad = roundup2(first_seg_len, CACHE_LINE_SIZE) - first_seg_len;
	bytes = align_hdr_pad + pad_bytes + pkt_len;

	return (bytes);
}

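/*
 * Carves "bytes" out of the FIFO and describes the allocation in at most two
 * iovecs: an allocation that wraps is split into a tail fragment and a
 * fragment that restarts at offset 0. E.g. a 100-byte allocation starting
 * 40 bytes before the end of the FIFO yields iov[0] covering those 40 bytes
 * and iov[1] covering the remaining 60 at the start.
 */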
static int
gve_tx_alloc_fifo(struct gve_tx_fifo *fifo, size_t bytes,
    struct gve_tx_iovec iov[2])
{
	size_t overflow, padding;
	uint32_t aligned_head;
	int nfrags = 0;

	if (bytes == 0)
		return (0);

	/*
	 * This check happens before we know how much padding is needed to
	 * align to a cacheline boundary for the payload, but that is fine,
	 * because the FIFO head always starts aligned, and the FIFO's boundaries
	 * are aligned, so if there is space for the data, there is space for
	 * the padding to the next alignment.
	 */
	KASSERT(gve_tx_fifo_can_alloc(fifo, bytes),
	    ("Allocating gve tx fifo when there is no room"));

	nfrags++;

	iov[0].iov_offset = fifo->head;
	iov[0].iov_len = bytes;
	fifo->head += bytes;

	if (fifo->head > fifo->size) {
		/*
		 * If the allocation did not fit in the tail fragment of the
		 * FIFO, also use the head fragment.
		 */
		nfrags++;
		overflow = fifo->head - fifo->size;
		iov[0].iov_len -= overflow;
		iov[1].iov_offset = 0;	/* Start of fifo */
		iov[1].iov_len = overflow;

		fifo->head = overflow;
	}

	/* Re-align to a cacheline boundary */
	aligned_head = roundup2(fifo->head, CACHE_LINE_SIZE);
	padding = aligned_head - fifo->head;
	iov[nfrags - 1].iov_padding = padding;
	atomic_add_int(&fifo->available, -(bytes + padding));
	fifo->head = aligned_head;

	if (fifo->head == fifo->size)
		fifo->head = 0;

	return (nfrags);
}

/* The only error this returns is ENOBUFS, when the tx fifo is short of space */
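/*
 * Descriptor layout per packet on the GQI ring: one packet descriptor, an
 * optional metadata (mtd) descriptor when the mbuf carries a flow hash, and
 * one segment descriptor for each payload iovec allocated from the FIFO.
 */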
static int
gve_xmit(struct gve_tx_ring *tx, struct mbuf *mbuf)
{
	bool is_tso, has_csum_flag, is_ipv6 = false, is_tcp = false, is_udp = false;
	int csum_flags, csum_offset, mtd_desc_nr, offset, copy_offset;
	uint16_t tso_mss, l4_off, l4_data_off, pkt_len, first_seg_len;
	int pad_bytes, hdr_nfrags, payload_nfrags;
	struct gve_tx_pkt_desc *pkt_desc;
	struct gve_tx_seg_desc *seg_desc;
	struct gve_tx_mtd_desc *mtd_desc;
	struct gve_tx_buffer_state *info;
	uint32_t idx = tx->req & tx->mask;
	struct ether_header *eh;
	struct mbuf *mbuf_next;
	int payload_iov = 2;
	int bytes_required;
	struct ip6_hdr *ip6;
	struct tcphdr *th;
	uint32_t next_idx;
	uint8_t l3_off;
	struct ip *ip;
	int i;

	info = &tx->info[idx];
	csum_flags = mbuf->m_pkthdr.csum_flags;
	pkt_len = mbuf->m_pkthdr.len;
	is_tso = csum_flags & CSUM_TSO;
	has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP |
	    CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO);
	mtd_desc_nr = M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE ? 1 : 0;
	tso_mss = is_tso ? mbuf->m_pkthdr.tso_segsz : 0;

	eh = mtod(mbuf, struct ether_header *);
	KASSERT(eh->ether_type != ETHERTYPE_VLAN,
	    ("VLAN-tagged packets not supported"));

	is_ipv6 = ntohs(eh->ether_type) == ETHERTYPE_IPV6;
	l3_off = ETHER_HDR_LEN;
	mbuf_next = m_getptr(mbuf, l3_off, &offset);

	if (is_ipv6) {
		ip6 = (struct ip6_hdr *)(mtodo(mbuf_next, offset));
		l4_off = l3_off + sizeof(struct ip6_hdr);
		is_tcp = (ip6->ip6_nxt == IPPROTO_TCP);
		is_udp = (ip6->ip6_nxt == IPPROTO_UDP);
		mbuf_next = m_getptr(mbuf, l4_off, &offset);
	} else if (ntohs(eh->ether_type) == ETHERTYPE_IP) {
		ip = (struct ip *)(mtodo(mbuf_next, offset));
		l4_off = l3_off + (ip->ip_hl << 2);
		is_tcp = (ip->ip_p == IPPROTO_TCP);
		is_udp = (ip->ip_p == IPPROTO_UDP);
		mbuf_next = m_getptr(mbuf, l4_off, &offset);
	}

	l4_data_off = 0;
	if (is_tcp) {
		th = (struct tcphdr *)(mtodo(mbuf_next, offset));
		l4_data_off = l4_off + (th->th_off << 2);
	} else if (is_udp)
		l4_data_off = l4_off + sizeof(struct udphdr);

	if (has_csum_flag) {
		if ((csum_flags & (CSUM_TSO | CSUM_TCP | CSUM_IP6_TCP)) != 0)
			csum_offset = offsetof(struct tcphdr, th_sum);
		else
			csum_offset = offsetof(struct udphdr, uh_sum);
	}

	/*
	 * If this packet is neither a TCP nor a UDP packet, the first segment,
	 * the one represented by the packet descriptor, will carry the
	 * spec-stipulated minimum of 182B.
	 */
	if (l4_data_off != 0)
		first_seg_len = l4_data_off;
	else
		first_seg_len = MIN(pkt_len, GVE_GQ_TX_MIN_PKT_DESC_BYTES);

	bytes_required = gve_fifo_bytes_required(tx, first_seg_len, pkt_len);
	if (__predict_false(!gve_can_tx(tx, bytes_required))) {
		counter_enter();
		counter_u64_add_protected(tx->stats.tx_delayed_pkt_nospace_device, 1);
		counter_exit();
		return (ENOBUFS);
	}

	/* So that the cleanup taskqueue can free the mbuf eventually. */
	info->mbuf = mbuf;

	/*
	 * We don't want to split the header, so if necessary, pad to the end
	 * of the fifo and then put the header at the beginning of the fifo.
	 */
	pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len);
	hdr_nfrags = gve_tx_alloc_fifo(&tx->fifo, first_seg_len + pad_bytes,
	    &info->iov[0]);
	KASSERT(hdr_nfrags > 0, ("Number of header fragments for gve tx is 0"));
	payload_nfrags = gve_tx_alloc_fifo(&tx->fifo, pkt_len - first_seg_len,
	    &info->iov[payload_iov]);

	pkt_desc = &tx->desc_ring[idx].pkt;
	gve_tx_fill_pkt_desc(pkt_desc, is_tso, l4_off,
	    1 + mtd_desc_nr + payload_nfrags, first_seg_len,
	    info->iov[hdr_nfrags - 1].iov_offset, has_csum_flag, csum_offset,
	    pkt_len);

	m_copydata(mbuf, 0, first_seg_len,
	    (char *)tx->fifo.base + info->iov[hdr_nfrags - 1].iov_offset);
	gve_dma_sync_for_device(tx->com.qpl,
	    info->iov[hdr_nfrags - 1].iov_offset,
	    info->iov[hdr_nfrags - 1].iov_len);
	copy_offset = first_seg_len;

	if (mtd_desc_nr == 1) {
		next_idx = (tx->req + 1) & tx->mask;
		mtd_desc = &tx->desc_ring[next_idx].mtd;
		gve_tx_fill_mtd_desc(mtd_desc, mbuf);
	}

	for (i = payload_iov; i < payload_nfrags + payload_iov; i++) {
		next_idx = (tx->req + 1 + mtd_desc_nr + i - payload_iov) & tx->mask;
		seg_desc = &tx->desc_ring[next_idx].seg;

		gve_tx_fill_seg_desc(seg_desc, is_tso, info->iov[i].iov_len,
		    info->iov[i].iov_offset, is_ipv6, l3_off, tso_mss);

		m_copydata(mbuf, copy_offset, info->iov[i].iov_len,
		    (char *)tx->fifo.base + info->iov[i].iov_offset);
		gve_dma_sync_for_device(tx->com.qpl,
		    info->iov[i].iov_offset, info->iov[i].iov_len);
		copy_offset += info->iov[i].iov_len;
	}

	tx->req += (1 + mtd_desc_nr + payload_nfrags);
	if (is_tso) {
		counter_enter();
		counter_u64_add_protected(tx->stats.tso_packet_cnt, 1);
		counter_exit();
	}
	return (0);
}

static int
gve_xmit_mbuf(struct gve_tx_ring *tx,
    struct mbuf **mbuf)
{
	if (gve_is_gqi(tx->com.priv))
		return (gve_xmit(tx, *mbuf));

	if (gve_is_qpl(tx->com.priv))
		return (gve_xmit_dqo_qpl(tx, *mbuf));

	/*
	 * gve_xmit_dqo might attempt to defrag the mbuf chain.
	 * The reference is passed in so that in the case of
	 * errors, the new mbuf chain is what's put back on the br.
	 */
	return (gve_xmit_dqo(tx, mbuf));
}

/*
 * Has the side-effect of stopping the xmit queue by setting tx->stopped
 */
static int
gve_xmit_retry_enobuf_mbuf(struct gve_tx_ring *tx,
    struct mbuf **mbuf)
{
	int err;

	atomic_store_bool(&tx->stopped, true);

	/*
	 * Room made in the queue BEFORE the barrier will be seen by the
	 * gve_xmit_mbuf retry below.
	 *
	 * If room is made in the queue AFTER the barrier, the cleanup tq
	 * iteration creating the room will either see a tx->stopped value
	 * of 0 or the 1 we just wrote:
	 *
	 *   If it sees a 1, then it would enqueue the xmit tq. Enqueue
	 *   implies a retry on the waiting pkt.
	 *
	 *   If it sees a 0, then that implies a previous iteration overwrote
	 *   our 1, and that iteration would enqueue the xmit tq. Enqueue
	 *   implies a retry on the waiting pkt.
	 */
	atomic_thread_fence_seq_cst();

	err = gve_xmit_mbuf(tx, mbuf);
	if (err == 0)
		atomic_store_bool(&tx->stopped, false);

	return (err);
}

static void
gve_xmit_br(struct gve_tx_ring *tx)
{
	struct gve_priv *priv = tx->com.priv;
	struct ifnet *ifp = priv->ifp;
	struct mbuf *mbuf;
	int err;

	while ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0 &&
	    (mbuf = drbr_peek(ifp, tx->br)) != NULL) {
		err = gve_xmit_mbuf(tx, &mbuf);

		/*
		 * We need to stop this taskqueue when we can't xmit the pkt due
		 * to lack of space in the NIC ring (ENOBUFS). The retry exists
		 * to guard against a TOCTTOU bug that could end up freezing the
		 * queue forever.
		 */
		if (__predict_false(mbuf != NULL && err == ENOBUFS))
			err = gve_xmit_retry_enobuf_mbuf(tx, &mbuf);

		if (__predict_false(err != 0 && mbuf != NULL)) {
			if (err == EINVAL) {
				drbr_advance(ifp, tx->br);
				m_freem(mbuf);
			} else
				drbr_putback(ifp, tx->br, mbuf);
			break;
		}

		drbr_advance(ifp, tx->br);
		BPF_MTAP(ifp, mbuf);

		bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map,
		    BUS_DMASYNC_PREWRITE);

		if (gve_is_gqi(priv))
			gve_db_bar_write_4(priv, tx->com.db_offset, tx->req);
		else
			gve_db_bar_dqo_write_4(priv, tx->com.db_offset,
			    tx->dqo.desc_tail);
	}
}

void
gve_xmit_tq(void *arg, int pending)
{
	struct gve_tx_ring *tx = (struct gve_tx_ring *)arg;

	GVE_RING_LOCK(tx);
	gve_xmit_br(tx);
	GVE_RING_UNLOCK(tx);
}

static bool
is_vlan_tagged_pkt(struct mbuf *mbuf)
{
	struct ether_header *eh;

	eh = mtod(mbuf, struct ether_header *);
	return (ntohs(eh->ether_type) == ETHERTYPE_VLAN);
}

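/*
 * if_transmit entry point: picks a tx ring from the mbuf's flow id (or the
 * current CPU when there is no hash), enqueues onto that ring's buf_ring, and
 * either drains it inline when the ring was empty or defers to the xmit
 * taskqueue.
 */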
int
gve_xmit_ifp(if_t ifp, struct mbuf *mbuf)
{
	struct gve_priv *priv = if_getsoftc(ifp);
	struct gve_tx_ring *tx;
	bool is_br_empty;
	int err;
	uint32_t i;

	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
		return (ENODEV);

	if (M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE)
		i = mbuf->m_pkthdr.flowid % priv->tx_cfg.num_queues;
	else
		i = curcpu % priv->tx_cfg.num_queues;
	tx = &priv->tx[i];

	if (__predict_false(is_vlan_tagged_pkt(mbuf))) {
		counter_enter();
		counter_u64_add_protected(tx->stats.tx_dropped_pkt_vlan, 1);
		counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1);
		counter_exit();
		m_freem(mbuf);
		return (ENODEV);
	}

	is_br_empty = drbr_empty(ifp, tx->br);
	err = drbr_enqueue(ifp, tx->br, mbuf);
	if (__predict_false(err != 0)) {
		if (!atomic_load_bool(&tx->stopped))
			taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
		counter_enter();
		counter_u64_add_protected(tx->stats.tx_dropped_pkt_nospace_bufring, 1);
		counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1);
		counter_exit();
		return (err);
	}

	/*
	 * If the mbuf we just enqueued is the only one on the ring, then
	 * transmit it right away in the interests of low latency.
	 */
	if (is_br_empty && (GVE_RING_TRYLOCK(tx) != 0)) {
		gve_xmit_br(tx);
		GVE_RING_UNLOCK(tx);
	} else if (!atomic_load_bool(&tx->stopped))
		taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);

	return (0);
}

void
gve_qflush(if_t ifp)
{
	struct gve_priv *priv = if_getsoftc(ifp);
	struct gve_tx_ring *tx;
	int i;

	for (i = 0; i < priv->tx_cfg.num_queues; ++i) {
		tx = &priv->tx[i];
		if (drbr_empty(ifp, tx->br) == 0) {
			GVE_RING_LOCK(tx);
			drbr_flush(ifp, tx->br);
			GVE_RING_UNLOCK(tx);
		}
	}

	if_qflush(ifp);
}