/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 2023-2024 Google LLC
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 *    may be used to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include "gve.h"
#include "gve_adminq.h"
#include "gve_dqo.h"

#define GVE_GQ_TX_MIN_PKT_DESC_BYTES 182

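/*
 * The GQI TX FIFO is carved out of the ring's queue page list, which is
 * mapped as one contiguous KVA region; "head" and "available" implement a
 * simple wrapping bump allocator over that region.
 */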
static int
gve_tx_fifo_init(struct gve_priv *priv, struct gve_tx_ring *tx)
{
	struct gve_queue_page_list *qpl = tx->com.qpl;
	struct gve_tx_fifo *fifo = &tx->fifo;

	fifo->size = qpl->num_pages * PAGE_SIZE;
	fifo->base = qpl->kva;
	atomic_store_int(&fifo->available, fifo->size);
	fifo->head = 0;

	return (0);
}

static void
gve_tx_free_ring_gqi(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;

	if (tx->desc_ring != NULL) {
		gve_dma_free_coherent(&tx->desc_ring_mem);
		tx->desc_ring = NULL;
	}

	if (tx->info != NULL) {
		free(tx->info, M_GVE);
		tx->info = NULL;
	}

	if (com->qpl != NULL) {
		gve_free_qpl(priv, com->qpl);
		com->qpl = NULL;
	}
}

static void
gve_tx_free_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;

	/* Safe to call even if never alloced */
	gve_free_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS);

	if (mtx_initialized(&tx->ring_mtx))
		mtx_destroy(&tx->ring_mtx);

	if (com->q_resources != NULL) {
		gve_dma_free_coherent(&com->q_resources_mem);
		com->q_resources = NULL;
	}

	if (tx->br != NULL) {
		buf_ring_free(tx->br, M_DEVBUF);
		tx->br = NULL;
	}

	if (gve_is_gqi(priv))
		gve_tx_free_ring_gqi(priv, i);
	else
		gve_tx_free_ring_dqo(priv, i);
}

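/*
 * Allocates the GQI descriptor ring, the QPL that backs the bounce FIFO,
 * and the per-descriptor buffer state used to track in-flight mbufs.
 */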
static int
gve_tx_alloc_ring_gqi(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;
	int err;

	err = gve_dma_alloc_coherent(priv,
	    sizeof(union gve_tx_desc) * priv->tx_desc_cnt,
	    CACHE_LINE_SIZE, &tx->desc_ring_mem);
	if (err != 0) {
		device_printf(priv->dev,
		    "Failed to alloc desc ring for tx ring %d", i);
		goto abort;
	}
	tx->desc_ring = tx->desc_ring_mem.cpu_addr;

	com->qpl = gve_alloc_qpl(priv, i, priv->tx_desc_cnt / GVE_QPL_DIVISOR,
	    /*single_kva=*/true);
	if (com->qpl == NULL) {
		device_printf(priv->dev,
		    "Failed to alloc QPL for tx ring %d\n", i);
		err = ENOMEM;
		goto abort;
	}

	err = gve_tx_fifo_init(priv, tx);
	if (err != 0)
		goto abort;

	tx->info = malloc(
	    sizeof(struct gve_tx_buffer_state) * priv->tx_desc_cnt,
	    M_GVE, M_WAITOK | M_ZERO);
	return (0);

abort:
	gve_tx_free_ring_gqi(priv, i);
	return (err);
}

static int
gve_tx_alloc_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;
	char mtx_name[16];
	int err;

	com->priv = priv;
	com->id = i;

	if (gve_is_gqi(priv))
		err = gve_tx_alloc_ring_gqi(priv, i);
	else
		err = gve_tx_alloc_ring_dqo(priv, i);
	if (err != 0)
		goto abort;

	sprintf(mtx_name, "gvetx%d", i);
	mtx_init(&tx->ring_mtx, mtx_name, NULL, MTX_DEF);

	tx->br = buf_ring_alloc(GVE_TX_BUFRING_ENTRIES, M_DEVBUF,
	    M_WAITOK, &tx->ring_mtx);

	gve_alloc_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS);

	err = gve_dma_alloc_coherent(priv, sizeof(struct gve_queue_resources),
	    PAGE_SIZE, &com->q_resources_mem);
	if (err != 0) {
		device_printf(priv->dev,
		    "Failed to alloc queue resources for tx ring %d", i);
		goto abort;
	}
	com->q_resources = com->q_resources_mem.cpu_addr;

	return (0);

abort:
	gve_tx_free_ring(priv, i);
	return (err);
}

int
gve_alloc_tx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx)
{
	int i;
	int err;

	KASSERT(priv->tx != NULL, ("priv->tx is NULL!"));

	for (i = start_idx; i < stop_idx; i++) {
		err = gve_tx_alloc_ring(priv, i);
		if (err != 0)
			goto free_rings;
	}

	return (0);
free_rings:
	gve_free_tx_rings(priv, start_idx, i);
	return (err);
}

void
gve_free_tx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx)
{
	int i;

	for (i = start_idx; i < stop_idx; i++)
		gve_tx_free_ring(priv, i);
}

static void
gve_tx_clear_desc_ring(struct gve_tx_ring *tx)
{
	struct gve_ring_com *com = &tx->com;
	int i;

	for (i = 0; i < com->priv->tx_desc_cnt; i++) {
		tx->desc_ring[i] = (union gve_tx_desc){};
		tx->info[i] = (struct gve_tx_buffer_state){};
	}

	bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map,
	    BUS_DMASYNC_PREWRITE);
}

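/*
 * Resets the ring's producer/consumer counters and the FIFO allocator, and
 * zeroes the descriptor ring, so the queue can be (re)created on the device
 * from a clean state.
 */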
static void
gve_clear_tx_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_tx_fifo *fifo = &tx->fifo;

	tx->req = 0;
	tx->done = 0;
	tx->mask = priv->tx_desc_cnt - 1;

	atomic_store_int(&fifo->available, fifo->size);
	fifo->head = 0;

	gve_tx_clear_desc_ring(tx);
}

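/*
 * Marks the ring as running and spins up its two taskqueues: one for
 * processing completions and one for draining the transmit buf_ring.
 */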
static void
gve_start_tx_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;

	atomic_store_bool(&tx->stopped, false);
	if (gve_is_gqi(priv))
		NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq, tx);
	else
		NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq_dqo, tx);
	com->cleanup_tq = taskqueue_create_fast("gve tx", M_WAITOK,
	    taskqueue_thread_enqueue, &com->cleanup_tq);
	taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET, "%s txq %d",
	    device_get_nameunit(priv->dev), i);

	TASK_INIT(&tx->xmit_task, 0, gve_xmit_tq, tx);
	tx->xmit_tq = taskqueue_create_fast("gve tx xmit",
	    M_WAITOK, taskqueue_thread_enqueue, &tx->xmit_tq);
	taskqueue_start_threads(&tx->xmit_tq, 1, PI_NET, "%s txq %d xmit",
	    device_get_nameunit(priv->dev), i);
}

int
gve_create_tx_rings(struct gve_priv *priv)
{
	struct gve_ring_com *com;
	struct gve_tx_ring *tx;
	int err;
	int i;

	if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK))
		return (0);

	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
		if (gve_is_gqi(priv))
			gve_clear_tx_ring(priv, i);
		else
			gve_clear_tx_ring_dqo(priv, i);
	}

	err = gve_adminq_create_tx_queues(priv, priv->tx_cfg.num_queues);
	if (err != 0)
		return (err);

	bus_dmamap_sync(priv->irqs_db_mem.tag, priv->irqs_db_mem.map,
	    BUS_DMASYNC_POSTREAD);

	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
		tx = &priv->tx[i];
		com = &tx->com;

		com->irq_db_offset = 4 * be32toh(priv->irq_db_indices[com->ntfy_id].index);

		bus_dmamap_sync(com->q_resources_mem.tag, com->q_resources_mem.map,
		    BUS_DMASYNC_POSTREAD);
		com->db_offset = 4 * be32toh(com->q_resources->db_index);
		com->counter_idx = be32toh(com->q_resources->counter_index);

		gve_start_tx_ring(priv, i);
	}

	gve_set_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK);
	return (0);
}

static void
gve_stop_tx_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;

	if (com->cleanup_tq != NULL) {
		taskqueue_quiesce(com->cleanup_tq);
		taskqueue_free(com->cleanup_tq);
		com->cleanup_tq = NULL;
	}

	if (tx->xmit_tq != NULL) {
		taskqueue_quiesce(tx->xmit_tq);
		taskqueue_free(tx->xmit_tq);
		tx->xmit_tq = NULL;
	}
}

int
gve_destroy_tx_rings(struct gve_priv *priv)
{
	int err;
	int i;

	for (i = 0; i < priv->tx_cfg.num_queues; i++)
		gve_stop_tx_ring(priv, i);

	if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK)) {
		err = gve_adminq_destroy_tx_queues(priv, priv->tx_cfg.num_queues);
		if (err != 0)
			return (err);
		gve_clear_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK);
	}

	return (0);
}

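/*
 * Interrupt filter for a TX ring: masks the ring's interrupt and defers the
 * completion work to the cleanup taskqueue.
 */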
int
gve_tx_intr(void *arg)
{
	struct gve_tx_ring *tx = arg;
	struct gve_priv *priv = tx->com.priv;
	struct gve_ring_com *com = &tx->com;

	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
		return (FILTER_STRAY);

	gve_db_bar_write_4(priv, com->irq_db_offset, GVE_IRQ_MASK);
	taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task);
	return (FILTER_HANDLED);
}

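/*
 * Returns the NIC-maintained completion count for this ring, read from the
 * big-endian counter array shared with the device.
 */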
static uint32_t
gve_tx_load_event_counter(struct gve_priv *priv, struct gve_tx_ring *tx)
{
	bus_dmamap_sync(priv->counter_array_mem.tag, priv->counter_array_mem.map,
	    BUS_DMASYNC_POSTREAD);
	uint32_t counter = priv->counters[tx->com.counter_idx];
	return (be32toh(counter));
}

static void
gve_tx_free_fifo(struct gve_tx_fifo *fifo, size_t bytes)
{
	atomic_add_int(&fifo->available, bytes);
}

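/*
 * Completion path for a GQI TX ring: frees the mbufs whose descriptors the
 * NIC has consumed, returns their FIFO bytes, re-arms the interrupt, and
 * kicks the xmit taskqueue if the ring had been stopped for lack of space.
 */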
void
gve_tx_cleanup_tq(void *arg, int pending)
{
	struct gve_tx_ring *tx = arg;
	struct gve_priv *priv = tx->com.priv;
	uint32_t nic_done = gve_tx_load_event_counter(priv, tx);
	uint32_t todo = nic_done - tx->done;
	size_t space_freed = 0;
	int i, j;

	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
		return;

	for (j = 0; j < todo; j++) {
		uint32_t idx = tx->done & tx->mask;
		struct gve_tx_buffer_state *info = &tx->info[idx];
		struct mbuf *mbuf = info->mbuf;

		tx->done++;
		if (mbuf == NULL)
			continue;

		info->mbuf = NULL;
		counter_enter();
		counter_u64_add_protected(tx->stats.tbytes, mbuf->m_pkthdr.len);
		counter_u64_add_protected(tx->stats.tpackets, 1);
		counter_exit();
		m_freem(mbuf);

		for (i = 0; i < GVE_TX_MAX_DESCS; i++) {
			space_freed += info->iov[i].iov_len + info->iov[i].iov_padding;
			info->iov[i].iov_len = 0;
			info->iov[i].iov_padding = 0;
		}
	}

	gve_tx_free_fifo(&tx->fifo, space_freed);

	gve_db_bar_write_4(priv, tx->com.irq_db_offset,
	    GVE_IRQ_ACK | GVE_IRQ_EVENT);

	/*
	 * Completions born before this barrier MAY NOT cause the NIC to send an
	 * interrupt but they will still be handled by the enqueue below.
	 * Completions born after the barrier WILL trigger an interrupt.
	 */
	atomic_thread_fence_seq_cst();

	nic_done = gve_tx_load_event_counter(priv, tx);
	todo = nic_done - tx->done;
	if (todo != 0) {
		gve_db_bar_write_4(priv, tx->com.irq_db_offset, GVE_IRQ_MASK);
		taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task);
	}

	if (atomic_load_bool(&tx->stopped) && space_freed) {
		atomic_store_bool(&tx->stopped, false);
		taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
	}
}

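/*
 * Syncs, for device access, every QPL page touched by the
 * [iov_offset, iov_offset + iov_len) range that was just written.
 */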
static void
gve_dma_sync_for_device(struct gve_queue_page_list *qpl,
    uint64_t iov_offset, uint64_t iov_len)
{
	uint64_t last_page = (iov_offset + iov_len - 1) / PAGE_SIZE;
	uint64_t first_page = iov_offset / PAGE_SIZE;
	struct gve_dma_handle *dma;
	uint64_t page;

	for (page = first_page; page <= last_page; page++) {
		dma = &(qpl->dmas[page]);
		bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREWRITE);
	}
}

static void
gve_tx_fill_mtd_desc(struct gve_tx_mtd_desc *mtd_desc, struct mbuf *mbuf)
{
	mtd_desc->type_flags = GVE_TXD_MTD | GVE_MTD_SUBTYPE_PATH;
	mtd_desc->path_state = GVE_MTD_PATH_STATE_DEFAULT | GVE_MTD_PATH_HASH_L4;
	mtd_desc->path_hash = htobe32(mbuf->m_pkthdr.flowid);
	mtd_desc->reserved0 = 0;
	mtd_desc->reserved1 = 0;
}

static void
gve_tx_fill_pkt_desc(struct gve_tx_pkt_desc *pkt_desc, bool is_tso,
    uint16_t l4_hdr_offset, uint32_t desc_cnt,
    uint16_t first_seg_len, uint64_t addr, bool has_csum_flag,
    int csum_offset, uint16_t pkt_len)
{
	if (is_tso) {
		pkt_desc->type_flags = GVE_TXD_TSO | GVE_TXF_L4CSUM;
		pkt_desc->l4_csum_offset = csum_offset >> 1;
		pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1;
	} else if (has_csum_flag) {
		pkt_desc->type_flags = GVE_TXD_STD | GVE_TXF_L4CSUM;
		pkt_desc->l4_csum_offset = csum_offset >> 1;
		pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1;
	} else {
		pkt_desc->type_flags = GVE_TXD_STD;
		pkt_desc->l4_csum_offset = 0;
		pkt_desc->l4_hdr_offset = 0;
	}
	pkt_desc->desc_cnt = desc_cnt;
	pkt_desc->len = htobe16(pkt_len);
	pkt_desc->seg_len = htobe16(first_seg_len);
	pkt_desc->seg_addr = htobe64(addr);
}

static void
gve_tx_fill_seg_desc(struct gve_tx_seg_desc *seg_desc,
    bool is_tso, uint16_t len, uint64_t addr,
    bool is_ipv6, uint8_t l3_off, uint16_t tso_mss)
{
	seg_desc->type_flags = GVE_TXD_SEG;
	if (is_tso) {
		if (is_ipv6)
			seg_desc->type_flags |= GVE_TXSF_IPV6;
		seg_desc->l3_offset = l3_off >> 1;
		seg_desc->mss = htobe16(tso_mss);
	}
	seg_desc->seg_len = htobe16(len);
	seg_desc->seg_addr = htobe64(addr);
}

static inline uint32_t
gve_tx_avail(struct gve_tx_ring *tx)
{
	return (tx->mask + 1 - (tx->req - tx->done));
}

static bool
gve_tx_fifo_can_alloc(struct gve_tx_fifo *fifo, size_t bytes)
{
	return (atomic_load_int(&fifo->available) >= bytes);
}

static inline bool
gve_can_tx(struct gve_tx_ring *tx, int bytes_required)
{
	return (gve_tx_avail(tx) >= (GVE_TX_MAX_DESCS + 1) &&
	    gve_tx_fifo_can_alloc(&tx->fifo, bytes_required));
}

static int
gve_tx_fifo_pad_alloc_one_frag(struct gve_tx_fifo *fifo, size_t bytes)
{
	return (fifo->head + bytes < fifo->size) ? 0 : fifo->size - fifo->head;
}

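/*
 * Worst-case FIFO bytes needed to transmit the packet: any padding to the
 * end of the FIFO required to keep the first segment contiguous, plus the
 * padding needed to cacheline-align the payload after it, plus the packet
 * bytes themselves.
 */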
static inline int
gve_fifo_bytes_required(struct gve_tx_ring *tx, uint16_t first_seg_len,
    uint16_t pkt_len)
{
	int pad_bytes, align_hdr_pad;
	int bytes;

	pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len);
	/* We need to take into account the header alignment padding. */
	align_hdr_pad = roundup2(first_seg_len, CACHE_LINE_SIZE) - first_seg_len;
	bytes = align_hdr_pad + pad_bytes + pkt_len;

	return (bytes);
}

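/*
 * Carves "bytes" out of the FIFO, splitting the allocation into a second
 * iovec if it wraps past the end, and padding the new head up to the next
 * cacheline boundary. Returns the number of iovecs used.
 */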
static int
gve_tx_alloc_fifo(struct gve_tx_fifo *fifo, size_t bytes,
    struct gve_tx_iovec iov[2])
{
	size_t overflow, padding;
	uint32_t aligned_head;
	int nfrags = 0;

	if (bytes == 0)
		return (0);

	/*
	 * This check happens before we know how much padding is needed to
	 * align to a cacheline boundary for the payload, but that is fine,
	 * because the FIFO head always starts aligned, and the FIFO's boundaries
	 * are aligned, so if there is space for the data, there is space for
	 * the padding to the next alignment.
	 */
	KASSERT(gve_tx_fifo_can_alloc(fifo, bytes),
	    ("Allocating gve tx fifo when there is no room"));

	nfrags++;

	iov[0].iov_offset = fifo->head;
	iov[0].iov_len = bytes;
	fifo->head += bytes;

	if (fifo->head > fifo->size) {
		/*
		 * If the allocation did not fit in the tail fragment of the
		 * FIFO, also use the head fragment.
		 */
		nfrags++;
		overflow = fifo->head - fifo->size;
		iov[0].iov_len -= overflow;
		iov[1].iov_offset = 0;	/* Start of fifo */
		iov[1].iov_len = overflow;

		fifo->head = overflow;
	}

	/* Re-align to a cacheline boundary */
	aligned_head = roundup2(fifo->head, CACHE_LINE_SIZE);
	padding = aligned_head - fifo->head;
	iov[nfrags - 1].iov_padding = padding;
	atomic_add_int(&fifo->available, -(bytes + padding));
	fifo->head = aligned_head;

	if (fifo->head == fifo->size)
		fifo->head = 0;

	return (nfrags);
}

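/*
 * GQI transmit path: parses the headers, copies the first segment (the
 * protocol headers, or a minimum-sized chunk) and then the payload into the
 * QPL-backed FIFO, and writes the packet descriptor, an optional metadata
 * descriptor, and one segment descriptor per payload fragment.
 */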
/* The only error this returns is ENOBUFS, when the TX FIFO is short of space. */
static int
gve_xmit(struct gve_tx_ring *tx, struct mbuf *mbuf)
{
	bool is_tso, has_csum_flag, is_ipv6 = false, is_tcp = false, is_udp = false;
	int csum_flags, csum_offset, mtd_desc_nr, offset, copy_offset;
	uint16_t tso_mss, l4_off, l4_data_off, pkt_len, first_seg_len;
	int pad_bytes, hdr_nfrags, payload_nfrags;
	struct gve_tx_pkt_desc *pkt_desc;
	struct gve_tx_seg_desc *seg_desc;
	struct gve_tx_mtd_desc *mtd_desc;
	struct gve_tx_buffer_state *info;
	uint32_t idx = tx->req & tx->mask;
	struct ether_header *eh;
	struct mbuf *mbuf_next;
	int payload_iov = 2;
	int bytes_required;
	struct ip6_hdr *ip6;
	struct tcphdr *th;
	uint32_t next_idx;
	uint8_t l3_off;
	struct ip *ip;
	int i;

	info = &tx->info[idx];
	csum_flags = mbuf->m_pkthdr.csum_flags;
	pkt_len = mbuf->m_pkthdr.len;
	is_tso = csum_flags & CSUM_TSO;
	has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP |
	    CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO);
	mtd_desc_nr = M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE ? 1 : 0;
	tso_mss = is_tso ? mbuf->m_pkthdr.tso_segsz : 0;

	eh = mtod(mbuf, struct ether_header *);
	KASSERT(eh->ether_type != ETHERTYPE_VLAN,
	    ("VLAN-tagged packets not supported"));

	is_ipv6 = ntohs(eh->ether_type) == ETHERTYPE_IPV6;
	l3_off = ETHER_HDR_LEN;
	mbuf_next = m_getptr(mbuf, l3_off, &offset);

	if (is_ipv6) {
		ip6 = (struct ip6_hdr *)(mtodo(mbuf_next, offset));
		l4_off = l3_off + sizeof(struct ip6_hdr);
		is_tcp = (ip6->ip6_nxt == IPPROTO_TCP);
		is_udp = (ip6->ip6_nxt == IPPROTO_UDP);
		mbuf_next = m_getptr(mbuf, l4_off, &offset);
	} else if (ntohs(eh->ether_type) == ETHERTYPE_IP) {
		ip = (struct ip *)(mtodo(mbuf_next, offset));
		l4_off = l3_off + (ip->ip_hl << 2);
		is_tcp = (ip->ip_p == IPPROTO_TCP);
		is_udp = (ip->ip_p == IPPROTO_UDP);
		mbuf_next = m_getptr(mbuf, l4_off, &offset);
	}

	l4_data_off = 0;
	if (is_tcp) {
		th = (struct tcphdr *)(mtodo(mbuf_next, offset));
		l4_data_off = l4_off + (th->th_off << 2);
	} else if (is_udp)
		l4_data_off = l4_off + sizeof(struct udphdr);

	if (has_csum_flag) {
		if ((csum_flags & (CSUM_TSO | CSUM_TCP | CSUM_IP6_TCP)) != 0)
			csum_offset = offsetof(struct tcphdr, th_sum);
		else
			csum_offset = offsetof(struct udphdr, uh_sum);
	}

	/*
	 * If this packet is neither a TCP nor a UDP packet, the first segment,
	 * the one represented by the packet descriptor, will carry the
	 * spec-stipulated minimum of 182B.
	 */
	if (l4_data_off != 0)
		first_seg_len = l4_data_off;
	else
		first_seg_len = MIN(pkt_len, GVE_GQ_TX_MIN_PKT_DESC_BYTES);

	bytes_required = gve_fifo_bytes_required(tx, first_seg_len, pkt_len);
	if (__predict_false(!gve_can_tx(tx, bytes_required))) {
		counter_enter();
		counter_u64_add_protected(tx->stats.tx_delayed_pkt_nospace_device, 1);
		counter_exit();
		return (ENOBUFS);
	}

	/* So that the cleanup taskqueue can free the mbuf eventually. */
	info->mbuf = mbuf;

	/*
	 * We don't want to split the header, so if necessary, pad to the end
	 * of the fifo and then put the header at the beginning of the fifo.
	 */
	pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len);
	hdr_nfrags = gve_tx_alloc_fifo(&tx->fifo, first_seg_len + pad_bytes,
	    &info->iov[0]);
	KASSERT(hdr_nfrags > 0, ("Number of header fragments for gve tx is 0"));
	payload_nfrags = gve_tx_alloc_fifo(&tx->fifo, pkt_len - first_seg_len,
	    &info->iov[payload_iov]);

	pkt_desc = &tx->desc_ring[idx].pkt;
	gve_tx_fill_pkt_desc(pkt_desc, is_tso, l4_off,
	    1 + mtd_desc_nr + payload_nfrags, first_seg_len,
	    info->iov[hdr_nfrags - 1].iov_offset, has_csum_flag, csum_offset,
	    pkt_len);

	m_copydata(mbuf, 0, first_seg_len,
	    (char *)tx->fifo.base + info->iov[hdr_nfrags - 1].iov_offset);
	gve_dma_sync_for_device(tx->com.qpl,
	    info->iov[hdr_nfrags - 1].iov_offset,
	    info->iov[hdr_nfrags - 1].iov_len);
	copy_offset = first_seg_len;

	if (mtd_desc_nr == 1) {
		next_idx = (tx->req + 1) & tx->mask;
		mtd_desc = &tx->desc_ring[next_idx].mtd;
		gve_tx_fill_mtd_desc(mtd_desc, mbuf);
	}

	for (i = payload_iov; i < payload_nfrags + payload_iov; i++) {
		next_idx = (tx->req + 1 + mtd_desc_nr + i - payload_iov) & tx->mask;
		seg_desc = &tx->desc_ring[next_idx].seg;

		gve_tx_fill_seg_desc(seg_desc, is_tso, info->iov[i].iov_len,
		    info->iov[i].iov_offset, is_ipv6, l3_off, tso_mss);

		m_copydata(mbuf, copy_offset, info->iov[i].iov_len,
		    (char *)tx->fifo.base + info->iov[i].iov_offset);
		gve_dma_sync_for_device(tx->com.qpl,
		    info->iov[i].iov_offset, info->iov[i].iov_len);
		copy_offset += info->iov[i].iov_len;
	}

	tx->req += (1 + mtd_desc_nr + payload_nfrags);
	if (is_tso) {
		counter_enter();
		counter_u64_add_protected(tx->stats.tso_packet_cnt, 1);
		counter_exit();
	}
	return (0);
}

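/*
 * Dispatches the mbuf to the transmit routine matching the queue format:
 * GQI, DQO with QPL-backed buffers, or plain DQO.
 */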
static int
gve_xmit_mbuf(struct gve_tx_ring *tx,
    struct mbuf **mbuf)
{
	if (gve_is_gqi(tx->com.priv))
		return (gve_xmit(tx, *mbuf));

	if (gve_is_qpl(tx->com.priv))
		return (gve_xmit_dqo_qpl(tx, *mbuf));

	/*
	 * gve_xmit_dqo might attempt to defrag the mbuf chain.
	 * The reference is passed in so that in the case of
	 * errors, the new mbuf chain is what's put back on the br.
	 */
	return (gve_xmit_dqo(tx, mbuf));
}

/*
 * Has the side-effect of stopping the xmit queue by setting tx->stopped
 */
static int
gve_xmit_retry_enobuf_mbuf(struct gve_tx_ring *tx,
    struct mbuf **mbuf)
{
	int err;

	atomic_store_bool(&tx->stopped, true);

	/*
	 * Room made in the queue BEFORE the barrier will be seen by the
	 * gve_xmit_mbuf retry below.
	 *
	 * If room is made in the queue AFTER the barrier, the cleanup tq
	 * iteration creating the room will either see a tx->stopped value
	 * of 0 or the 1 we just wrote:
	 *
	 * If it sees a 1, then it would enqueue the xmit tq. Enqueue
	 * implies a retry on the waiting pkt.
	 *
	 * If it sees a 0, then that implies a previous iteration overwrote
	 * our 1, and that iteration would enqueue the xmit tq. Enqueue
	 * implies a retry on the waiting pkt.
	 */
	atomic_thread_fence_seq_cst();

	err = gve_xmit_mbuf(tx, mbuf);
	if (err == 0)
		atomic_store_bool(&tx->stopped, false);

	return (err);
}

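/*
 * Drains the ring's buf_ring while the interface is running, ringing the
 * doorbell after each successfully enqueued packet. Called with the ring
 * lock held.
 */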
static void
gve_xmit_br(struct gve_tx_ring *tx)
{
	struct gve_priv *priv = tx->com.priv;
	struct ifnet *ifp = priv->ifp;
	struct mbuf *mbuf;
	int err;

	while ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0 &&
	    (mbuf = drbr_peek(ifp, tx->br)) != NULL) {
		err = gve_xmit_mbuf(tx, &mbuf);

		/*
		 * We need to stop this taskqueue when we can't xmit the pkt due
		 * to lack of space in the NIC ring (ENOBUFS). The retry exists
		 * to guard against a TOCTTOU bug that could end up freezing the
		 * queue forever.
		 */
		if (__predict_false(mbuf != NULL && err == ENOBUFS))
			err = gve_xmit_retry_enobuf_mbuf(tx, &mbuf);

		if (__predict_false(err != 0 && mbuf != NULL)) {
			if (err == EINVAL) {
				drbr_advance(ifp, tx->br);
				m_freem(mbuf);
			} else
				drbr_putback(ifp, tx->br, mbuf);
			break;
		}

		drbr_advance(ifp, tx->br);
		BPF_MTAP(ifp, mbuf);

		bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map,
		    BUS_DMASYNC_PREWRITE);

		if (gve_is_gqi(priv))
			gve_db_bar_write_4(priv, tx->com.db_offset, tx->req);
		else
			gve_db_bar_dqo_write_4(priv, tx->com.db_offset,
			    tx->dqo.desc_tail);
	}
}

void
gve_xmit_tq(void *arg, int pending)
{
	struct gve_tx_ring *tx = (struct gve_tx_ring *)arg;

	GVE_RING_LOCK(tx);
	gve_xmit_br(tx);
	GVE_RING_UNLOCK(tx);
}

static bool
is_vlan_tagged_pkt(struct mbuf *mbuf)
{
	struct ether_header *eh;

	eh = mtod(mbuf, struct ether_header *);
	return (ntohs(eh->ether_type) == ETHERTYPE_VLAN);
}

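/*
 * if_transmit handler: picks a TX queue by flowid (or current CPU), drops
 * VLAN-tagged packets, enqueues the mbuf on that ring's buf_ring, and either
 * transmits inline or defers to the ring's xmit taskqueue.
 */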
int
gve_xmit_ifp(if_t ifp, struct mbuf *mbuf)
{
	struct gve_priv *priv = if_getsoftc(ifp);
	struct gve_tx_ring *tx;
	bool is_br_empty;
	int err;
	uint32_t i;

	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
		return (ENODEV);

	if (M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE)
		i = mbuf->m_pkthdr.flowid % priv->tx_cfg.num_queues;
	else
		i = curcpu % priv->tx_cfg.num_queues;
	tx = &priv->tx[i];

	if (__predict_false(is_vlan_tagged_pkt(mbuf))) {
		counter_enter();
		counter_u64_add_protected(tx->stats.tx_dropped_pkt_vlan, 1);
		counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1);
		counter_exit();
		m_freem(mbuf);
		return (ENODEV);
	}

	is_br_empty = drbr_empty(ifp, tx->br);
	err = drbr_enqueue(ifp, tx->br, mbuf);
	if (__predict_false(err != 0)) {
		if (!atomic_load_bool(&tx->stopped))
			taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
		counter_enter();
		counter_u64_add_protected(tx->stats.tx_dropped_pkt_nospace_bufring, 1);
		counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1);
		counter_exit();
		return (err);
	}

	/*
	 * If the mbuf we just enqueued is the only one on the ring, then
	 * transmit it right away in the interests of low latency.
	 */
	if (is_br_empty && (GVE_RING_TRYLOCK(tx) != 0)) {
		gve_xmit_br(tx);
		GVE_RING_UNLOCK(tx);
	} else if (!atomic_load_bool(&tx->stopped))
		taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);

	return (0);
}

void
gve_qflush(if_t ifp)
{
	struct gve_priv *priv = if_getsoftc(ifp);
	struct gve_tx_ring *tx;
	int i;

	for (i = 0; i < priv->tx_cfg.num_queues; ++i) {
		tx = &priv->tx[i];
		if (drbr_empty(ifp, tx->br) == 0) {
			GVE_RING_LOCK(tx);
			drbr_flush(ifp, tx->br);
			GVE_RING_UNLOCK(tx);
		}
	}

	if_qflush(ifp);
}