xref: /freebsd/sys/dev/gve/gve_tx.c (revision 3d2957336c7ddaa0a29cf60cfd458c07df1f5be9)
/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 2023-2024 Google LLC
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 *    may be used to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include "gve.h"
#include "gve_adminq.h"
#include "gve_dqo.h"

#define GVE_GQ_TX_MIN_PKT_DESC_BYTES 182

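/*
 * The GQI TX FIFO is a bounce buffer carved out of the ring's queue page list
 * (QPL), which is mapped as one contiguous KVA region; packet bytes are
 * copied into it before being described to the NIC.
 */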
static int
gve_tx_fifo_init(struct gve_priv *priv, struct gve_tx_ring *tx)
{
	struct gve_queue_page_list *qpl = tx->com.qpl;
	struct gve_tx_fifo *fifo = &tx->fifo;

	fifo->size = qpl->num_pages * PAGE_SIZE;
	fifo->base = qpl->kva;
	atomic_store_int(&fifo->available, fifo->size);
	fifo->head = 0;

	return (0);
}

static void
gve_tx_free_ring_gqi(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;

	if (tx->desc_ring != NULL) {
		gve_dma_free_coherent(&tx->desc_ring_mem);
		tx->desc_ring = NULL;
	}

	if (tx->info != NULL) {
		free(tx->info, M_GVE);
		tx->info = NULL;
	}

	if (com->qpl != NULL) {
		gve_free_qpl(priv, com->qpl);
		com->qpl = NULL;
	}
}

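/*
 * Frees the resources shared by both queue formats, then hands off to the
 * GQI- or DQO-specific teardown.
 */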
static void
gve_tx_free_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;

	/* Safe to call even if never alloced */
	gve_free_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS);

	if (mtx_initialized(&tx->ring_mtx))
		mtx_destroy(&tx->ring_mtx);

	if (com->q_resources != NULL) {
		gve_dma_free_coherent(&com->q_resources_mem);
		com->q_resources = NULL;
	}

	if (tx->br != NULL) {
		buf_ring_free(tx->br, M_DEVBUF);
		tx->br = NULL;
	}

	if (gve_is_gqi(priv))
		gve_tx_free_ring_gqi(priv, i);
	else
		gve_tx_free_ring_dqo(priv, i);
}

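/*
 * Allocates the GQI descriptor ring, the QPL that backs the copy FIFO, and
 * the per-descriptor buffer-state array.
 */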
static int
gve_tx_alloc_ring_gqi(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;
	int err;

	err = gve_dma_alloc_coherent(priv,
	    sizeof(union gve_tx_desc) * priv->tx_desc_cnt,
	    CACHE_LINE_SIZE, &tx->desc_ring_mem);
	if (err != 0) {
		device_printf(priv->dev,
		    "Failed to alloc desc ring for tx ring %d", i);
		goto abort;
	}
	tx->desc_ring = tx->desc_ring_mem.cpu_addr;

	com->qpl = gve_alloc_qpl(priv, i, priv->tx_desc_cnt / GVE_QPL_DIVISOR,
	    /*single_kva=*/true);
	if (com->qpl == NULL) {
		device_printf(priv->dev,
		    "Failed to alloc QPL for tx ring %d\n", i);
		err = ENOMEM;
		goto abort;
	}

	err = gve_tx_fifo_init(priv, tx);
	if (err != 0)
		goto abort;

	tx->info = malloc(
	    sizeof(struct gve_tx_buffer_state) * priv->tx_desc_cnt,
	    M_GVE, M_WAITOK | M_ZERO);
	return (0);

abort:
	gve_tx_free_ring_gqi(priv, i);
	return (err);
}

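/*
 * Format-independent ring setup: dispatches to the GQI or DQO allocator, then
 * creates the ring mutex, the buf_ring, the stats counters, and the queue
 * resources shared with the device.
 */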
static int
gve_tx_alloc_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;
	char mtx_name[16];
	int err;

	com->priv = priv;
	com->id = i;

	if (gve_is_gqi(priv))
		err = gve_tx_alloc_ring_gqi(priv, i);
	else
		err = gve_tx_alloc_ring_dqo(priv, i);
	if (err != 0)
		goto abort;

	sprintf(mtx_name, "gvetx%d", i);
	mtx_init(&tx->ring_mtx, mtx_name, NULL, MTX_DEF);

	tx->br = buf_ring_alloc(GVE_TX_BUFRING_ENTRIES, M_DEVBUF,
	    M_WAITOK, &tx->ring_mtx);

	gve_alloc_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS);

	err = gve_dma_alloc_coherent(priv, sizeof(struct gve_queue_resources),
	    PAGE_SIZE, &com->q_resources_mem);
	if (err != 0) {
		device_printf(priv->dev,
		    "Failed to alloc queue resources for tx ring %d", i);
		goto abort;
	}
	com->q_resources = com->q_resources_mem.cpu_addr;

	tx->last_kicked = 0;

	return (0);

abort:
	gve_tx_free_ring(priv, i);
	return (err);
}

int
gve_alloc_tx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx)
{
	int i;
	int err;

	KASSERT(priv->tx != NULL, ("priv->tx is NULL!"));

	for (i = start_idx; i < stop_idx; i++) {
		err = gve_tx_alloc_ring(priv, i);
		if (err != 0)
			goto free_rings;
	}

	return (0);
free_rings:
	gve_free_tx_rings(priv, start_idx, i);
	return (err);
}

void
gve_free_tx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx)
{
	int i;

	for (i = start_idx; i < stop_idx; i++)
		gve_tx_free_ring(priv, i);
}

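/*
 * Zeroes every descriptor and its buffer state, invalidates the per-packet
 * timestamps, and syncs the ring memory for the device.
 */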
static void
gve_tx_clear_desc_ring(struct gve_tx_ring *tx)
{
	struct gve_ring_com *com = &tx->com;
	int i;

	for (i = 0; i < com->priv->tx_desc_cnt; i++) {
		tx->desc_ring[i] = (union gve_tx_desc){};
		tx->info[i] = (struct gve_tx_buffer_state){};
		gve_invalidate_timestamp(&tx->info[i].enqueue_time_sec);
	}

	bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map,
	    BUS_DMASYNC_PREWRITE);
}

static void
gve_clear_tx_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_tx_fifo *fifo = &tx->fifo;

	tx->req = 0;
	tx->done = 0;
	tx->mask = priv->tx_desc_cnt - 1;

	atomic_store_int(&fifo->available, fifo->size);
	fifo->head = 0;

	gve_tx_clear_desc_ring(tx);
}

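/*
 * Creates the per-ring cleanup (completion) and xmit taskqueues and marks the
 * ring as not stopped.
 */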
static void
gve_start_tx_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;

	atomic_store_bool(&tx->stopped, false);
	if (gve_is_gqi(priv))
		NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq, tx);
	else
		NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq_dqo, tx);
	com->cleanup_tq = taskqueue_create_fast("gve tx", M_WAITOK,
	    taskqueue_thread_enqueue, &com->cleanup_tq);
	taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET, "%s txq %d",
	    device_get_nameunit(priv->dev), i);

	TASK_INIT(&tx->xmit_task, 0, gve_xmit_tq, tx);
	tx->xmit_tq = taskqueue_create_fast("gve tx xmit",
	    M_WAITOK, taskqueue_thread_enqueue, &tx->xmit_tq);
	taskqueue_start_threads(&tx->xmit_tq, 1, PI_NET, "%s txq %d xmit",
	    device_get_nameunit(priv->dev), i);
}

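/*
 * Resets each ring's host-side state, asks the device over the admin queue to
 * create the TX queues, and records the doorbell and event-counter offsets
 * the device reports back.
 */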
int
gve_create_tx_rings(struct gve_priv *priv)
{
	struct gve_ring_com *com;
	struct gve_tx_ring *tx;
	int err;
	int i;

	if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK))
		return (0);

	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
		if (gve_is_gqi(priv))
			gve_clear_tx_ring(priv, i);
		else
			gve_clear_tx_ring_dqo(priv, i);
	}

	err = gve_adminq_create_tx_queues(priv, priv->tx_cfg.num_queues);
	if (err != 0)
		return (err);

	bus_dmamap_sync(priv->irqs_db_mem.tag, priv->irqs_db_mem.map,
	    BUS_DMASYNC_POSTREAD);

	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
		tx = &priv->tx[i];
		com = &tx->com;

		com->irq_db_offset = 4 * be32toh(priv->irq_db_indices[com->ntfy_id].index);

		bus_dmamap_sync(com->q_resources_mem.tag, com->q_resources_mem.map,
		    BUS_DMASYNC_POSTREAD);
		com->db_offset = 4 * be32toh(com->q_resources->db_index);
		com->counter_idx = be32toh(com->q_resources->counter_index);

		gve_start_tx_ring(priv, i);
	}

	gve_set_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK);
	return (0);
}

static void
gve_stop_tx_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;

	if (com->cleanup_tq != NULL) {
		taskqueue_quiesce(com->cleanup_tq);
		taskqueue_free(com->cleanup_tq);
		com->cleanup_tq = NULL;
	}

	if (tx->xmit_tq != NULL) {
		taskqueue_quiesce(tx->xmit_tq);
		taskqueue_free(tx->xmit_tq);
		tx->xmit_tq = NULL;
	}
}

int
gve_destroy_tx_rings(struct gve_priv *priv)
{
	int err;
	int i;

	for (i = 0; i < priv->tx_cfg.num_queues; i++)
		gve_stop_tx_ring(priv, i);

	if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK)) {
		err = gve_adminq_destroy_tx_queues(priv, priv->tx_cfg.num_queues);
		if (err != 0)
			return (err);
		gve_clear_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK);
	}

	return (0);
}

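/*
 * Returns the number of packets on this ring that have been waiting on a
 * completion for longer than GVE_TX_TIMEOUT_PKT_SEC.
 */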
int
gve_check_tx_timeout_gqi(struct gve_priv *priv, struct gve_tx_ring *tx)
{
	struct gve_tx_buffer_state *info;
	uint32_t pkt_idx;
	int num_timeouts;

	num_timeouts = 0;

	for (pkt_idx = 0; pkt_idx < priv->tx_desc_cnt; pkt_idx++) {
		info = &tx->info[pkt_idx];

		if (!gve_timestamp_valid(&info->enqueue_time_sec))
			continue;

		if (__predict_false(
		    gve_seconds_since(&info->enqueue_time_sec) >
		    GVE_TX_TIMEOUT_PKT_SEC))
			num_timeouts += 1;
	}

	return (num_timeouts);
}

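/*
 * Interrupt filter: masks the ring's interrupt and defers completion
 * processing to the cleanup taskqueue.
 */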
int
gve_tx_intr(void *arg)
{
	struct gve_tx_ring *tx = arg;
	struct gve_priv *priv = tx->com.priv;
	struct gve_ring_com *com = &tx->com;

	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
		return (FILTER_STRAY);

	gve_db_bar_write_4(priv, com->irq_db_offset, GVE_IRQ_MASK);
	taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task);
	return (FILTER_HANDLED);
}

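/*
 * Reads the completion counter the device writes for this ring out of the
 * shared counter array.
 */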
static uint32_t
gve_tx_load_event_counter(struct gve_priv *priv, struct gve_tx_ring *tx)
{
	bus_dmamap_sync(priv->counter_array_mem.tag, priv->counter_array_mem.map,
	    BUS_DMASYNC_POSTREAD);
	uint32_t counter = priv->counters[tx->com.counter_idx];
	return (be32toh(counter));
}

static void
gve_tx_free_fifo(struct gve_tx_fifo *fifo, size_t bytes)
{
	atomic_add_int(&fifo->available, bytes);
}

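/*
 * Completion processing: frees the mbufs the NIC has finished with, returns
 * their FIFO space, acks and re-arms the interrupt, and restarts transmission
 * if the ring had been stopped for lack of space.
 */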
void
gve_tx_cleanup_tq(void *arg, int pending)
{
	struct gve_tx_ring *tx = arg;
	struct gve_priv *priv = tx->com.priv;
	uint32_t nic_done = gve_tx_load_event_counter(priv, tx);
	uint32_t todo = nic_done - tx->done;
	size_t space_freed = 0;
	int i, j;

	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
		return;

	for (j = 0; j < todo; j++) {
		uint32_t idx = tx->done & tx->mask;
		struct gve_tx_buffer_state *info = &tx->info[idx];
		struct mbuf *mbuf = info->mbuf;

		tx->done++;
		if (mbuf == NULL)
			continue;

		gve_invalidate_timestamp(&info->enqueue_time_sec);

		info->mbuf = NULL;

		counter_enter();
		counter_u64_add_protected(tx->stats.tbytes, mbuf->m_pkthdr.len);
		counter_u64_add_protected(tx->stats.tpackets, 1);
		counter_exit();
		m_freem(mbuf);

		for (i = 0; i < GVE_TX_MAX_DESCS; i++) {
			space_freed += info->iov[i].iov_len + info->iov[i].iov_padding;
			info->iov[i].iov_len = 0;
			info->iov[i].iov_padding = 0;
		}
	}

	gve_tx_free_fifo(&tx->fifo, space_freed);

	gve_db_bar_write_4(priv, tx->com.irq_db_offset,
	    GVE_IRQ_ACK | GVE_IRQ_EVENT);

	/*
	 * Completions born before this barrier MAY NOT cause the NIC to send an
	 * interrupt but they will still be handled by the enqueue below.
	 * Completions born after the barrier WILL trigger an interrupt.
	 */
	atomic_thread_fence_seq_cst();

	nic_done = gve_tx_load_event_counter(priv, tx);
	todo = nic_done - tx->done;
	if (todo != 0) {
		gve_db_bar_write_4(priv, tx->com.irq_db_offset, GVE_IRQ_MASK);
		taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task);
	}

	if (atomic_load_bool(&tx->stopped) && space_freed) {
		atomic_store_bool(&tx->stopped, false);
		taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
	}
}

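/*
 * Syncs for the device every QPL page touched by the FIFO region
 * [iov_offset, iov_offset + iov_len).
 */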
static void
gve_dma_sync_for_device(struct gve_queue_page_list *qpl,
			uint64_t iov_offset, uint64_t iov_len)
{
	uint64_t last_page = (iov_offset + iov_len - 1) / PAGE_SIZE;
	uint64_t first_page = iov_offset / PAGE_SIZE;
	struct gve_dma_handle *dma;
	uint64_t page;

	for (page = first_page; page <= last_page; page++) {
		dma = &(qpl->dmas[page]);
		bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREWRITE);
	}
}

static void
gve_tx_fill_mtd_desc(struct gve_tx_mtd_desc *mtd_desc, struct mbuf *mbuf)
{
	mtd_desc->type_flags = GVE_TXD_MTD | GVE_MTD_SUBTYPE_PATH;
	mtd_desc->path_state = GVE_MTD_PATH_STATE_DEFAULT | GVE_MTD_PATH_HASH_L4;
	mtd_desc->path_hash = htobe32(mbuf->m_pkthdr.flowid);
	mtd_desc->reserved0 = 0;
	mtd_desc->reserved1 = 0;
}

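/*
 * Fills the leading packet descriptor: checksum/TSO type flags, the number of
 * descriptors that make up the packet, the total packet length, and the FIFO
 * address and length of the first segment.
 */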
static void
gve_tx_fill_pkt_desc(struct gve_tx_pkt_desc *pkt_desc, bool is_tso,
    uint16_t l4_hdr_offset, uint32_t desc_cnt,
    uint16_t first_seg_len, uint64_t addr, bool has_csum_flag,
    int csum_offset, uint16_t pkt_len)
{
	if (is_tso) {
		pkt_desc->type_flags = GVE_TXD_TSO | GVE_TXF_L4CSUM;
		pkt_desc->l4_csum_offset = csum_offset >> 1;
		pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1;
	} else if (has_csum_flag) {
		pkt_desc->type_flags = GVE_TXD_STD | GVE_TXF_L4CSUM;
		pkt_desc->l4_csum_offset = csum_offset >> 1;
		pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1;
	} else {
		pkt_desc->type_flags = GVE_TXD_STD;
		pkt_desc->l4_csum_offset = 0;
		pkt_desc->l4_hdr_offset = 0;
	}
	pkt_desc->desc_cnt = desc_cnt;
	pkt_desc->len = htobe16(pkt_len);
	pkt_desc->seg_len = htobe16(first_seg_len);
	pkt_desc->seg_addr = htobe64(addr);
}

static void
gve_tx_fill_seg_desc(struct gve_tx_seg_desc *seg_desc,
    bool is_tso, uint16_t len, uint64_t addr,
    bool is_ipv6, uint8_t l3_off, uint16_t tso_mss)
{
	seg_desc->type_flags = GVE_TXD_SEG;
	if (is_tso) {
		if (is_ipv6)
			seg_desc->type_flags |= GVE_TXSF_IPV6;
		seg_desc->l3_offset = l3_off >> 1;
		seg_desc->mss = htobe16(tso_mss);
	}
	seg_desc->seg_len = htobe16(len);
	seg_desc->seg_addr = htobe64(addr);
}

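/* Number of free descriptor slots in the ring. */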
static inline uint32_t
gve_tx_avail(struct gve_tx_ring *tx)
{
	return (tx->mask + 1 - (tx->req - tx->done));
}

static bool
gve_tx_fifo_can_alloc(struct gve_tx_fifo *fifo, size_t bytes)
{
	return (atomic_load_int(&fifo->available) >= bytes);
}

static inline bool
gve_can_tx(struct gve_tx_ring *tx, int bytes_required)
{
	return (gve_tx_avail(tx) >= (GVE_TX_MAX_DESCS + 1) &&
	    gve_tx_fifo_can_alloc(&tx->fifo, bytes_required));
}

static int
gve_tx_fifo_pad_alloc_one_frag(struct gve_tx_fifo *fifo, size_t bytes)
{
	return (fifo->head + bytes < fifo->size) ? 0 : fifo->size - fifo->head;
}

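/*
 * FIFO bytes a packet will consume: the packet itself, any padding needed to
 * keep the header in a single FIFO fragment, and the padding that re-aligns
 * the payload to a cache line after the header.
 */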
static inline int
gve_fifo_bytes_required(struct gve_tx_ring *tx, uint16_t first_seg_len,
    uint16_t pkt_len)
{
	int pad_bytes, align_hdr_pad;
	int bytes;

	pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len);
	/* We need to take into account the header alignment padding. */
	align_hdr_pad = roundup2(first_seg_len, CACHE_LINE_SIZE) - first_seg_len;
	bytes = align_hdr_pad + pad_bytes + pkt_len;

	return (bytes);
}

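/*
 * Carves "bytes" out of the FIFO, splitting the allocation into a second
 * iovec if it wraps past the end, and pads the new head up to a cache-line
 * boundary. Returns the number of iovecs used.
 */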
static int
gve_tx_alloc_fifo(struct gve_tx_fifo *fifo, size_t bytes,
    struct gve_tx_iovec iov[2])
{
	size_t overflow, padding;
	uint32_t aligned_head;
	int nfrags = 0;

	if (bytes == 0)
		return (0);

	/*
	 * This check happens before we know how much padding is needed to
	 * align to a cacheline boundary for the payload, but that is fine,
	 * because the FIFO head always starts aligned, and the FIFO's
	 * boundaries are aligned, so if there is space for the data, there is
	 * space for the padding to the next alignment.
	 */
	KASSERT(gve_tx_fifo_can_alloc(fifo, bytes),
	    ("Allocating gve tx fifo when there is no room"));

	nfrags++;

	iov[0].iov_offset = fifo->head;
	iov[0].iov_len = bytes;
	fifo->head += bytes;

	if (fifo->head > fifo->size) {
		/*
		 * If the allocation did not fit in the tail fragment of the
		 * FIFO, also use the head fragment.
		 */
		nfrags++;
		overflow = fifo->head - fifo->size;
		iov[0].iov_len -= overflow;
		iov[1].iov_offset = 0;	/* Start of fifo */
		iov[1].iov_len = overflow;

		fifo->head = overflow;
	}

	/* Re-align to a cacheline boundary */
	aligned_head = roundup2(fifo->head, CACHE_LINE_SIZE);
	padding = aligned_head - fifo->head;
	iov[nfrags - 1].iov_padding = padding;
	atomic_add_int(&fifo->available, -(bytes + padding));
	fifo->head = aligned_head;

	if (fifo->head == fifo->size)
		fifo->head = 0;

	return (nfrags);
}

/* The only error this returns is ENOBUFS, when the tx fifo is short of space. */
static int
gve_xmit(struct gve_tx_ring *tx, struct mbuf *mbuf)
{
	bool is_tso, has_csum_flag, is_ipv6 = false, is_tcp = false, is_udp = false;
	int csum_flags, csum_offset, mtd_desc_nr, offset, copy_offset;
	uint16_t tso_mss, l4_off, l4_data_off, pkt_len, first_seg_len;
	int pad_bytes, hdr_nfrags, payload_nfrags;
	struct gve_tx_pkt_desc *pkt_desc;
	struct gve_tx_seg_desc *seg_desc;
	struct gve_tx_mtd_desc *mtd_desc;
	struct gve_tx_buffer_state *info;
	uint32_t idx = tx->req & tx->mask;
	struct ether_header *eh;
	struct mbuf *mbuf_next;
	int payload_iov = 2;
	int bytes_required;
	struct ip6_hdr *ip6;
	struct tcphdr *th;
	uint32_t next_idx;
	uint8_t l3_off;
	struct ip *ip;
	int i;

	info = &tx->info[idx];
	csum_flags = mbuf->m_pkthdr.csum_flags;
	pkt_len = mbuf->m_pkthdr.len;
	is_tso = csum_flags & CSUM_TSO;
	has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP |
	    CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO);
	mtd_desc_nr = M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE ? 1 : 0;
	tso_mss = is_tso ? mbuf->m_pkthdr.tso_segsz : 0;

	eh = mtod(mbuf, struct ether_header *);
	KASSERT(eh->ether_type != ETHERTYPE_VLAN,
	    ("VLAN-tagged packets not supported"));

	is_ipv6 = ntohs(eh->ether_type) == ETHERTYPE_IPV6;
	l3_off = ETHER_HDR_LEN;
	mbuf_next = m_getptr(mbuf, l3_off, &offset);

	if (is_ipv6) {
		ip6 = (struct ip6_hdr *)(mtodo(mbuf_next, offset));
		l4_off = l3_off + sizeof(struct ip6_hdr);
		is_tcp = (ip6->ip6_nxt == IPPROTO_TCP);
		is_udp = (ip6->ip6_nxt == IPPROTO_UDP);
		mbuf_next = m_getptr(mbuf, l4_off, &offset);
	} else if (ntohs(eh->ether_type) == ETHERTYPE_IP) {
		ip = (struct ip *)(mtodo(mbuf_next, offset));
		l4_off = l3_off + (ip->ip_hl << 2);
		is_tcp = (ip->ip_p == IPPROTO_TCP);
		is_udp = (ip->ip_p == IPPROTO_UDP);
		mbuf_next = m_getptr(mbuf, l4_off, &offset);
	}

	l4_data_off = 0;
	if (is_tcp) {
		th = (struct tcphdr *)(mtodo(mbuf_next, offset));
		l4_data_off = l4_off + (th->th_off << 2);
	} else if (is_udp)
		l4_data_off = l4_off + sizeof(struct udphdr);

	if (has_csum_flag) {
		if ((csum_flags & (CSUM_TSO | CSUM_TCP | CSUM_IP6_TCP)) != 0)
			csum_offset = offsetof(struct tcphdr, th_sum);
		else
			csum_offset = offsetof(struct udphdr, uh_sum);
	}

	/*
	 * If this packet is neither a TCP nor a UDP packet, the first segment,
	 * the one represented by the packet descriptor, will carry the
	 * spec-stipulated minimum of 182B.
	 */
	if (l4_data_off != 0)
		first_seg_len = l4_data_off;
	else
		first_seg_len = MIN(pkt_len, GVE_GQ_TX_MIN_PKT_DESC_BYTES);

	bytes_required = gve_fifo_bytes_required(tx, first_seg_len, pkt_len);
	if (__predict_false(!gve_can_tx(tx, bytes_required))) {
		counter_enter();
		counter_u64_add_protected(tx->stats.tx_delayed_pkt_nospace_device, 1);
		counter_exit();
		return (ENOBUFS);
	}

	/* So that the cleanup taskqueue can free the mbuf eventually. */
	info->mbuf = mbuf;

	gve_set_timestamp(&info->enqueue_time_sec);

	/*
	 * We don't want to split the header, so if necessary, pad to the end
	 * of the fifo and then put the header at the beginning of the fifo.
	 */
	pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len);
	hdr_nfrags = gve_tx_alloc_fifo(&tx->fifo, first_seg_len + pad_bytes,
	    &info->iov[0]);
	KASSERT(hdr_nfrags > 0, ("Number of header fragments for gve tx is 0"));
	payload_nfrags = gve_tx_alloc_fifo(&tx->fifo, pkt_len - first_seg_len,
	    &info->iov[payload_iov]);

	pkt_desc = &tx->desc_ring[idx].pkt;
	gve_tx_fill_pkt_desc(pkt_desc, is_tso, l4_off,
	    1 + mtd_desc_nr + payload_nfrags, first_seg_len,
	    info->iov[hdr_nfrags - 1].iov_offset, has_csum_flag, csum_offset,
	    pkt_len);

	m_copydata(mbuf, 0, first_seg_len,
	    (char *)tx->fifo.base + info->iov[hdr_nfrags - 1].iov_offset);
	gve_dma_sync_for_device(tx->com.qpl,
	    info->iov[hdr_nfrags - 1].iov_offset,
	    info->iov[hdr_nfrags - 1].iov_len);
	copy_offset = first_seg_len;

	if (mtd_desc_nr == 1) {
		next_idx = (tx->req + 1) & tx->mask;
		mtd_desc = &tx->desc_ring[next_idx].mtd;
		gve_tx_fill_mtd_desc(mtd_desc, mbuf);
	}

	for (i = payload_iov; i < payload_nfrags + payload_iov; i++) {
		next_idx = (tx->req + 1 + mtd_desc_nr + i - payload_iov) & tx->mask;
		seg_desc = &tx->desc_ring[next_idx].seg;

		gve_tx_fill_seg_desc(seg_desc, is_tso, info->iov[i].iov_len,
		    info->iov[i].iov_offset, is_ipv6, l3_off, tso_mss);

		m_copydata(mbuf, copy_offset, info->iov[i].iov_len,
		    (char *)tx->fifo.base + info->iov[i].iov_offset);
		gve_dma_sync_for_device(tx->com.qpl,
		    info->iov[i].iov_offset, info->iov[i].iov_len);
		copy_offset += info->iov[i].iov_len;
	}

	tx->req += (1 + mtd_desc_nr + payload_nfrags);
	if (is_tso) {
		counter_enter();
		counter_u64_add_protected(tx->stats.tso_packet_cnt, 1);
		counter_exit();
	}
	return (0);
}

static int
gve_xmit_mbuf(struct gve_tx_ring *tx,
    struct mbuf **mbuf)
{
	if (gve_is_gqi(tx->com.priv))
		return (gve_xmit(tx, *mbuf));

	if (gve_is_qpl(tx->com.priv))
		return (gve_xmit_dqo_qpl(tx, *mbuf));

	/*
	 * gve_xmit_dqo might attempt to defrag the mbuf chain.
	 * The reference is passed in so that in the case of
	 * errors, the new mbuf chain is what's put back on the br.
	 */
	return (gve_xmit_dqo(tx, mbuf));
}

/*
 * Has the side-effect of stopping the xmit queue by setting tx->stopped
 */
static int
gve_xmit_retry_enobuf_mbuf(struct gve_tx_ring *tx,
    struct mbuf **mbuf)
{
	int err;

	atomic_store_bool(&tx->stopped, true);

	/*
	 * Room made in the queue BEFORE the barrier will be seen by the
	 * gve_xmit_mbuf retry below.
	 *
	 * If room is made in the queue AFTER the barrier, the cleanup tq
	 * iteration creating the room will either see a tx->stopped value
	 * of 0 or the 1 we just wrote:
	 *
	 *   If it sees a 1, then it would enqueue the xmit tq. Enqueue
	 *   implies a retry on the waiting pkt.
	 *
	 *   If it sees a 0, then that implies a previous iteration overwrote
	 *   our 1, and that iteration would enqueue the xmit tq. Enqueue
	 *   implies a retry on the waiting pkt.
	 */
	atomic_thread_fence_seq_cst();

	err = gve_xmit_mbuf(tx, mbuf);
	if (err == 0)
		atomic_store_bool(&tx->stopped, false);

	return (err);
}

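/*
 * Drains the ring's buf_ring, posting each packet to the NIC and ringing the
 * doorbell; stops early if the packet cannot be posted for lack of ring or
 * FIFO space. Called with the ring lock held.
 */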
static void
gve_xmit_br(struct gve_tx_ring *tx)
{
	struct gve_priv *priv = tx->com.priv;
	struct ifnet *ifp = priv->ifp;
	struct mbuf *mbuf;
	int err;

	while ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0 &&
	    (mbuf = drbr_peek(ifp, tx->br)) != NULL) {
		err = gve_xmit_mbuf(tx, &mbuf);

		/*
		 * We need to stop this taskqueue when we can't xmit the pkt due
		 * to lack of space in the NIC ring (ENOBUFS). The retry exists
		 * to guard against a TOCTTOU bug that could end up freezing the
		 * queue forever.
		 */
		if (__predict_false(mbuf != NULL && err == ENOBUFS))
			err = gve_xmit_retry_enobuf_mbuf(tx, &mbuf);

		if (__predict_false(err != 0 && mbuf != NULL)) {
			if (err == EINVAL) {
				drbr_advance(ifp, tx->br);
				m_freem(mbuf);
			} else
				drbr_putback(ifp, tx->br, mbuf);
			break;
		}

		drbr_advance(ifp, tx->br);
		BPF_MTAP(ifp, mbuf);

		bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map,
		    BUS_DMASYNC_PREWRITE);

		if (gve_is_gqi(priv))
			gve_db_bar_write_4(priv, tx->com.db_offset, tx->req);
		else
			gve_db_bar_dqo_write_4(priv, tx->com.db_offset,
			    tx->dqo.desc_tail);
	}
}

void
gve_xmit_tq(void *arg, int pending)
{
	struct gve_tx_ring *tx = (struct gve_tx_ring *)arg;

	GVE_RING_LOCK(tx);
	gve_xmit_br(tx);
	GVE_RING_UNLOCK(tx);
}

static bool
is_vlan_tagged_pkt(struct mbuf *mbuf)
{
	struct ether_header *eh;

	eh = mtod(mbuf, struct ether_header *);
	return (ntohs(eh->ether_type) == ETHERTYPE_VLAN);
}

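/*
 * if_transmit entry point: picks a ring by flowid (or by current CPU), drops
 * VLAN-tagged packets, enqueues onto the ring's buf_ring, and either
 * transmits inline when the ring was idle or defers to the xmit taskqueue.
 */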
int
gve_xmit_ifp(if_t ifp, struct mbuf *mbuf)
{
	struct gve_priv *priv = if_getsoftc(ifp);
	struct gve_tx_ring *tx;
	bool is_br_empty;
	int err;
	uint32_t i;

	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
		return (ENODEV);

	if (M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE)
		i = mbuf->m_pkthdr.flowid % priv->tx_cfg.num_queues;
	else
		i = curcpu % priv->tx_cfg.num_queues;
	tx = &priv->tx[i];

	if (__predict_false(is_vlan_tagged_pkt(mbuf))) {
		counter_enter();
		counter_u64_add_protected(tx->stats.tx_dropped_pkt_vlan, 1);
		counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1);
		counter_exit();
		m_freem(mbuf);
		return (ENODEV);
	}

	is_br_empty = drbr_empty(ifp, tx->br);
	err = drbr_enqueue(ifp, tx->br, mbuf);
	if (__predict_false(err != 0)) {
		if (!atomic_load_bool(&tx->stopped))
			taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
		counter_enter();
		counter_u64_add_protected(tx->stats.tx_dropped_pkt_nospace_bufring, 1);
		counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1);
		counter_exit();
		return (err);
	}

	/*
	 * If the mbuf we just enqueued is the only one on the ring, then
	 * transmit it right away in the interests of low latency.
	 */
	if (is_br_empty && (GVE_RING_TRYLOCK(tx) != 0)) {
		gve_xmit_br(tx);
		GVE_RING_UNLOCK(tx);
	} else if (!atomic_load_bool(&tx->stopped))
		taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);

	return (0);
}

void
gve_qflush(if_t ifp)
{
	struct gve_priv *priv = if_getsoftc(ifp);
	struct gve_tx_ring *tx;
	int i;

	for (i = 0; i < priv->tx_cfg.num_queues; ++i) {
		tx = &priv->tx[i];
		if (drbr_empty(ifp, tx->br) == 0) {
			GVE_RING_LOCK(tx);
			drbr_flush(ifp, tx->br);
			GVE_RING_UNLOCK(tx);
		}
	}

	if_qflush(ifp);
}
959