/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 2023-2024 Google LLC
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 *    may be used to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include "gve.h"
#include "gve_adminq.h"
#include "gve_dqo.h"

#define GVE_GQ_TX_MIN_PKT_DESC_BYTES 182

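/*
 * The GQI copy FIFO lives in the queue page list (QPL) registered for this
 * ring: its backing store is the QPL's contiguous kernel mapping, and
 * "available" tracks how many of its bytes are free for packet copies.
 */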
static int
gve_tx_fifo_init(struct gve_priv *priv, struct gve_tx_ring *tx)
{
	struct gve_queue_page_list *qpl = tx->com.qpl;
	struct gve_tx_fifo *fifo = &tx->fifo;

	fifo->size = qpl->num_pages * PAGE_SIZE;
	fifo->base = qpl->kva;
	atomic_store_int(&fifo->available, fifo->size);
	fifo->head = 0;

	return (0);
}

static void
gve_tx_free_ring_gqi(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];

	if (tx->desc_ring != NULL) {
		gve_dma_free_coherent(&tx->desc_ring_mem);
		tx->desc_ring = NULL;
	}

	if (tx->info != NULL) {
		free(tx->info, M_GVE);
		tx->info = NULL;
	}
}

static void
gve_tx_free_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;

	/* Safe to call even if never alloced */
	gve_free_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS);

	if (mtx_initialized(&tx->ring_mtx))
		mtx_destroy(&tx->ring_mtx);

	if (com->q_resources != NULL) {
		gve_dma_free_coherent(&com->q_resources_mem);
		com->q_resources = NULL;
	}

	if (tx->br != NULL) {
		buf_ring_free(tx->br, M_DEVBUF);
		tx->br = NULL;
	}

	if (gve_is_gqi(priv))
		gve_tx_free_ring_gqi(priv, i);
	else
		gve_tx_free_ring_dqo(priv, i);
}

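/*
 * Allocates the GQI resources owned by one tx ring: the DMA-coherent
 * descriptor ring, the copy FIFO carved out of the ring's queue page list,
 * and the per-slot metadata ("info") array used by completion processing.
 */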
static int
gve_tx_alloc_ring_gqi(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;
	int err;

	err = gve_dma_alloc_coherent(priv,
	    sizeof(union gve_tx_desc) * priv->tx_desc_cnt,
	    CACHE_LINE_SIZE, &tx->desc_ring_mem);
	if (err != 0) {
		device_printf(priv->dev,
		    "Failed to alloc desc ring for tx ring %d", i);
		goto abort;
	}
	tx->desc_ring = tx->desc_ring_mem.cpu_addr;

	com->qpl = &priv->qpls[i];
	if (com->qpl == NULL) {
		device_printf(priv->dev, "No QPL left for tx ring %d\n", i);
		err = ENOMEM;
		goto abort;
	}

	err = gve_tx_fifo_init(priv, tx);
	if (err != 0)
		goto abort;

	tx->info = malloc(
	    sizeof(struct gve_tx_buffer_state) * priv->tx_desc_cnt,
	    M_GVE, M_WAITOK | M_ZERO);
	return (0);

abort:
	gve_tx_free_ring_gqi(priv, i);
	return (err);
}

static int
gve_tx_alloc_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;
	char mtx_name[16];
	int err;

	com->priv = priv;
	com->id = i;

	if (gve_is_gqi(priv))
		err = gve_tx_alloc_ring_gqi(priv, i);
	else
		err = gve_tx_alloc_ring_dqo(priv, i);
	if (err != 0)
		goto abort;

	sprintf(mtx_name, "gvetx%d", i);
	mtx_init(&tx->ring_mtx, mtx_name, NULL, MTX_DEF);

	tx->br = buf_ring_alloc(GVE_TX_BUFRING_ENTRIES, M_DEVBUF,
	    M_WAITOK, &tx->ring_mtx);

	gve_alloc_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS);

	err = gve_dma_alloc_coherent(priv, sizeof(struct gve_queue_resources),
	    PAGE_SIZE, &com->q_resources_mem);
	if (err != 0) {
		device_printf(priv->dev,
		    "Failed to alloc queue resources for tx ring %d", i);
		goto abort;
	}
	com->q_resources = com->q_resources_mem.cpu_addr;

	return (0);

abort:
	gve_tx_free_ring(priv, i);
	return (err);
}

int
gve_alloc_tx_rings(struct gve_priv *priv)
{
	int err = 0;
	int i;

	priv->tx = malloc(sizeof(struct gve_tx_ring) * priv->tx_cfg.num_queues,
	    M_GVE, M_WAITOK | M_ZERO);

	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
		err = gve_tx_alloc_ring(priv, i);
		if (err != 0)
			goto free_rings;
	}

	return (0);

free_rings:
	while (i--)
		gve_tx_free_ring(priv, i);
	free(priv->tx, M_GVE);
	return (err);
}

void
gve_free_tx_rings(struct gve_priv *priv)
{
	int i;

	for (i = 0; i < priv->tx_cfg.num_queues; i++)
		gve_tx_free_ring(priv, i);

	free(priv->tx, M_GVE);
}

static void
gve_tx_clear_desc_ring(struct gve_tx_ring *tx)
{
	struct gve_ring_com *com = &tx->com;
	int i;

	for (i = 0; i < com->priv->tx_desc_cnt; i++) {
		tx->desc_ring[i] = (union gve_tx_desc){};
		tx->info[i] = (struct gve_tx_buffer_state){};
	}

	bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map,
	    BUS_DMASYNC_PREWRITE);
}

static void
gve_clear_tx_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_tx_fifo *fifo = &tx->fifo;

	tx->req = 0;
	tx->done = 0;
	tx->mask = priv->tx_desc_cnt - 1;

	atomic_store_int(&fifo->available, fifo->size);
	fifo->head = 0;

	gve_tx_clear_desc_ring(tx);
}

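/*
 * Starts the per-ring taskqueues: a "cleanup" taskqueue that runs the
 * completion handler passed in by the caller (GQI or DQO flavor), and an
 * "xmit" taskqueue that drains the buf_ring when gve_xmit_ifp cannot.
 */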
static void
gve_start_tx_ring(struct gve_priv *priv, int i,
    void (cleanup) (void *arg, int pending))
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;

	atomic_store_bool(&tx->stopped, false);

	NET_TASK_INIT(&com->cleanup_task, 0, cleanup, tx);
	com->cleanup_tq = taskqueue_create_fast("gve tx", M_WAITOK,
	    taskqueue_thread_enqueue, &com->cleanup_tq);
	taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET, "%s txq %d",
	    device_get_nameunit(priv->dev), i);

	TASK_INIT(&tx->xmit_task, 0, gve_xmit_tq, tx);
	tx->xmit_tq = taskqueue_create_fast("gve tx xmit",
	    M_WAITOK, taskqueue_thread_enqueue, &tx->xmit_tq);
	taskqueue_start_threads(&tx->xmit_tq, 1, PI_NET, "%s txq %d xmit",
	    device_get_nameunit(priv->dev), i);
}

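/*
 * Registers all tx queues with the device over the admin queue and then
 * caches each ring's doorbell, interrupt-doorbell, and event-counter
 * offsets from the queue resources the device filled in.
 */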
int
gve_create_tx_rings(struct gve_priv *priv)
{
	struct gve_ring_com *com;
	struct gve_tx_ring *tx;
	int err;
	int i;

	if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK))
		return (0);

	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
		if (gve_is_gqi(priv))
			gve_clear_tx_ring(priv, i);
		else
			gve_clear_tx_ring_dqo(priv, i);
	}

	err = gve_adminq_create_tx_queues(priv, priv->tx_cfg.num_queues);
	if (err != 0)
		return (err);

	bus_dmamap_sync(priv->irqs_db_mem.tag, priv->irqs_db_mem.map,
	    BUS_DMASYNC_POSTREAD);

	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
		tx = &priv->tx[i];
		com = &tx->com;

		com->irq_db_offset = 4 * be32toh(priv->irq_db_indices[com->ntfy_id].index);

		bus_dmamap_sync(com->q_resources_mem.tag, com->q_resources_mem.map,
		    BUS_DMASYNC_POSTREAD);
		com->db_offset = 4 * be32toh(com->q_resources->db_index);
		com->counter_idx = be32toh(com->q_resources->counter_index);

		if (gve_is_gqi(priv))
			gve_start_tx_ring(priv, i, gve_tx_cleanup_tq);
		else
			gve_start_tx_ring(priv, i, gve_tx_cleanup_tq_dqo);
	}

	gve_set_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK);
	return (0);
}

static void
gve_stop_tx_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;

	if (com->cleanup_tq != NULL) {
		taskqueue_quiesce(com->cleanup_tq);
		taskqueue_free(com->cleanup_tq);
		com->cleanup_tq = NULL;
	}

	if (tx->xmit_tq != NULL) {
		taskqueue_quiesce(tx->xmit_tq);
		taskqueue_free(tx->xmit_tq);
		tx->xmit_tq = NULL;
	}
}

int
gve_destroy_tx_rings(struct gve_priv *priv)
{
	int err;
	int i;

	for (i = 0; i < priv->tx_cfg.num_queues; i++)
		gve_stop_tx_ring(priv, i);

	if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK)) {
		err = gve_adminq_destroy_tx_queues(priv, priv->tx_cfg.num_queues);
		if (err != 0)
			return (err);
		gve_clear_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK);
	}

	return (0);
}

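/*
 * Interrupt filter for a tx ring: it masks further interrupts on the ring's
 * notify block and defers the actual completion work to the cleanup
 * taskqueue, which re-arms the interrupt when it is done.
 */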
int
gve_tx_intr(void *arg)
{
	struct gve_tx_ring *tx = arg;
	struct gve_priv *priv = tx->com.priv;
	struct gve_ring_com *com = &tx->com;

	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
		return (FILTER_STRAY);

	gve_db_bar_write_4(priv, com->irq_db_offset, GVE_IRQ_MASK);
	taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task);
	return (FILTER_HANDLED);
}

static uint32_t
gve_tx_load_event_counter(struct gve_priv *priv, struct gve_tx_ring *tx)
{
	bus_dmamap_sync(priv->counter_array_mem.tag, priv->counter_array_mem.map,
	    BUS_DMASYNC_POSTREAD);
	uint32_t counter = priv->counters[tx->com.counter_idx];
	return (be32toh(counter));
}

static void
gve_tx_free_fifo(struct gve_tx_fifo *fifo, size_t bytes)
{
	atomic_add_int(&fifo->available, bytes);
}

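/*
 * Completion handler for GQI tx rings. The device advances a per-ring event
 * counter as it finishes descriptors; everything between tx->done and that
 * counter can be reclaimed: the mbufs are freed, their FIFO bytes returned,
 * and the interrupt is re-armed. If the ring had stopped for lack of space,
 * the xmit taskqueue is kicked so waiting packets get retried.
 */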
void
gve_tx_cleanup_tq(void *arg, int pending)
{
	struct gve_tx_ring *tx = arg;
	struct gve_priv *priv = tx->com.priv;
	uint32_t nic_done = gve_tx_load_event_counter(priv, tx);
	uint32_t todo = nic_done - tx->done;
	size_t space_freed = 0;
	int i, j;

	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
		return;

	for (j = 0; j < todo; j++) {
		uint32_t idx = tx->done & tx->mask;
		struct gve_tx_buffer_state *info = &tx->info[idx];
		struct mbuf *mbuf = info->mbuf;

		tx->done++;
		if (mbuf == NULL)
			continue;

		info->mbuf = NULL;
		counter_enter();
		counter_u64_add_protected(tx->stats.tbytes, mbuf->m_pkthdr.len);
		counter_u64_add_protected(tx->stats.tpackets, 1);
		counter_exit();
		m_freem(mbuf);

		for (i = 0; i < GVE_TX_MAX_DESCS; i++) {
			space_freed += info->iov[i].iov_len + info->iov[i].iov_padding;
			info->iov[i].iov_len = 0;
			info->iov[i].iov_padding = 0;
		}
	}

	gve_tx_free_fifo(&tx->fifo, space_freed);

	gve_db_bar_write_4(priv, tx->com.irq_db_offset,
	    GVE_IRQ_ACK | GVE_IRQ_EVENT);

	/*
	 * Completions born before this barrier MAY NOT cause the NIC to send an
	 * interrupt but they will still be handled by the enqueue below.
	 * Completions born after the barrier WILL trigger an interrupt.
	 */
	mb();

	nic_done = gve_tx_load_event_counter(priv, tx);
	todo = nic_done - tx->done;
	if (todo != 0) {
		gve_db_bar_write_4(priv, tx->com.irq_db_offset, GVE_IRQ_MASK);
		taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task);
	}

	if (atomic_load_bool(&tx->stopped) && space_freed) {
		atomic_store_bool(&tx->stopped, false);
		taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
	}
}

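/*
 * Syncs every QPL page spanned by [iov_offset, iov_offset + iov_len) for
 * device access; the FIFO region just copied into may straddle page
 * boundaries, so each backing page gets its own PREWRITE sync.
 */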
static void
gve_dma_sync_for_device(struct gve_queue_page_list *qpl,
    uint64_t iov_offset, uint64_t iov_len)
{
	uint64_t last_page = (iov_offset + iov_len - 1) / PAGE_SIZE;
	uint64_t first_page = iov_offset / PAGE_SIZE;
	struct gve_dma_handle *dma;
	uint64_t page;

	for (page = first_page; page <= last_page; page++) {
		dma = &(qpl->dmas[page]);
		bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREWRITE);
	}
}

static void
gve_tx_fill_mtd_desc(struct gve_tx_mtd_desc *mtd_desc, struct mbuf *mbuf)
{
	mtd_desc->type_flags = GVE_TXD_MTD | GVE_MTD_SUBTYPE_PATH;
	mtd_desc->path_state = GVE_MTD_PATH_STATE_DEFAULT | GVE_MTD_PATH_HASH_L4;
	mtd_desc->path_hash = htobe32(mbuf->m_pkthdr.flowid);
	mtd_desc->reserved0 = 0;
	mtd_desc->reserved1 = 0;
}

static void
gve_tx_fill_pkt_desc(struct gve_tx_pkt_desc *pkt_desc, bool is_tso,
    uint16_t l4_hdr_offset, uint32_t desc_cnt,
    uint16_t first_seg_len, uint64_t addr, bool has_csum_flag,
    int csum_offset, uint16_t pkt_len)
{
	if (is_tso) {
		pkt_desc->type_flags = GVE_TXD_TSO | GVE_TXF_L4CSUM;
		pkt_desc->l4_csum_offset = csum_offset >> 1;
		pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1;
	} else if (has_csum_flag) {
		pkt_desc->type_flags = GVE_TXD_STD | GVE_TXF_L4CSUM;
		pkt_desc->l4_csum_offset = csum_offset >> 1;
		pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1;
	} else {
		pkt_desc->type_flags = GVE_TXD_STD;
		pkt_desc->l4_csum_offset = 0;
		pkt_desc->l4_hdr_offset = 0;
	}
	pkt_desc->desc_cnt = desc_cnt;
	pkt_desc->len = htobe16(pkt_len);
	pkt_desc->seg_len = htobe16(first_seg_len);
	pkt_desc->seg_addr = htobe64(addr);
}

static void
gve_tx_fill_seg_desc(struct gve_tx_seg_desc *seg_desc,
    bool is_tso, uint16_t len, uint64_t addr,
    bool is_ipv6, uint8_t l3_off, uint16_t tso_mss)
{
	seg_desc->type_flags = GVE_TXD_SEG;
	if (is_tso) {
		if (is_ipv6)
			seg_desc->type_flags |= GVE_TXSF_IPV6;
		seg_desc->l3_offset = l3_off >> 1;
		seg_desc->mss = htobe16(tso_mss);
	}
	seg_desc->seg_len = htobe16(len);
	seg_desc->seg_addr = htobe64(addr);
}

static inline uint32_t
gve_tx_avail(struct gve_tx_ring *tx)
{
	return (tx->mask + 1 - (tx->req - tx->done));
}

static bool
gve_tx_fifo_can_alloc(struct gve_tx_fifo *fifo, size_t bytes)
{
	return (atomic_load_int(&fifo->available) >= bytes);
}

static inline bool
gve_can_tx(struct gve_tx_ring *tx, int bytes_required)
{
	return (gve_tx_avail(tx) >= (GVE_TX_MAX_DESCS + 1) &&
	    gve_tx_fifo_can_alloc(&tx->fifo, bytes_required));
}

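/*
 * Returns 0 if a fragment of "bytes" bytes fits before the FIFO wraps,
 * otherwise the number of pad bytes needed to skip ahead to the start of
 * the FIFO so the fragment can be placed contiguously.
 */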
static int
gve_tx_fifo_pad_alloc_one_frag(struct gve_tx_fifo *fifo, size_t bytes)
{
	return (fifo->head + bytes < fifo->size) ? 0 : fifo->size - fifo->head;
}

static inline int
gve_fifo_bytes_required(struct gve_tx_ring *tx, uint16_t first_seg_len,
    uint16_t pkt_len)
{
	int pad_bytes, align_hdr_pad;
	int bytes;

	pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len);
	/* We need to take into account the header alignment padding. */
	align_hdr_pad = roundup2(first_seg_len, CACHE_LINE_SIZE) - first_seg_len;
	bytes = align_hdr_pad + pad_bytes + pkt_len;

	return (bytes);
}

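/*
 * Carves "bytes" out of the copy FIFO, wrapping around to the start of the
 * FIFO if necessary, and records the resulting one or two fragments in
 * "iov". The head is then padded up to a cache-line boundary, with the pad
 * charged to the last fragment. Returns the number of fragments used.
 */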
static int
gve_tx_alloc_fifo(struct gve_tx_fifo *fifo, size_t bytes,
    struct gve_tx_iovec iov[2])
{
	size_t overflow, padding;
	uint32_t aligned_head;
	int nfrags = 0;

	if (bytes == 0)
		return (0);

	/*
	 * This check happens before we know how much padding is needed to
	 * align to a cacheline boundary for the payload, but that is fine,
	 * because the FIFO head always starts aligned, and the FIFO's boundaries
	 * are aligned, so if there is space for the data, there is space for
	 * the padding to the next alignment.
	 */
	KASSERT(gve_tx_fifo_can_alloc(fifo, bytes),
	    ("Allocating gve tx fifo when there is no room"));

	nfrags++;

	iov[0].iov_offset = fifo->head;
	iov[0].iov_len = bytes;
	fifo->head += bytes;

	if (fifo->head > fifo->size) {
		/*
		 * If the allocation did not fit in the tail fragment of the
		 * FIFO, also use the head fragment.
		 */
		nfrags++;
		overflow = fifo->head - fifo->size;
		iov[0].iov_len -= overflow;
		iov[1].iov_offset = 0;	/* Start of FIFO */
		iov[1].iov_len = overflow;

		fifo->head = overflow;
	}

	/* Re-align to a cacheline boundary */
	aligned_head = roundup2(fifo->head, CACHE_LINE_SIZE);
	padding = aligned_head - fifo->head;
	iov[nfrags - 1].iov_padding = padding;
	atomic_add_int(&fifo->available, -(bytes + padding));
	fifo->head = aligned_head;

	if (fifo->head == fifo->size)
		fifo->head = 0;

	return (nfrags);
}

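/*
 * A packet is emitted as one packet descriptor carrying the first segment
 * copied into the FIFO (the protocol headers for TCP/UDP packets), an
 * optional metadata descriptor carrying the flow hash, and one segment
 * descriptor per payload fragment produced by the FIFO allocation.
 */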
/* The only error this returns is ENOBUFS, when the tx fifo is short of space. */
static int
gve_xmit(struct gve_tx_ring *tx, struct mbuf *mbuf)
{
	bool is_tso, has_csum_flag, is_ipv6 = false, is_tcp = false, is_udp = false;
	int csum_flags, csum_offset, mtd_desc_nr, offset, copy_offset;
	uint16_t tso_mss, l4_off, l4_data_off, pkt_len, first_seg_len;
	int pad_bytes, hdr_nfrags, payload_nfrags;
	struct gve_tx_pkt_desc *pkt_desc;
	struct gve_tx_seg_desc *seg_desc;
	struct gve_tx_mtd_desc *mtd_desc;
	struct gve_tx_buffer_state *info;
	uint32_t idx = tx->req & tx->mask;
	struct ether_header *eh;
	struct mbuf *mbuf_next;
	int payload_iov = 2;
	int bytes_required;
	struct ip6_hdr *ip6;
	struct tcphdr *th;
	uint32_t next_idx;
	uint8_t l3_off;
	struct ip *ip;
	int i;

	info = &tx->info[idx];
	csum_flags = mbuf->m_pkthdr.csum_flags;
	pkt_len = mbuf->m_pkthdr.len;
	is_tso = csum_flags & CSUM_TSO;
	has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP |
	    CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO);
	mtd_desc_nr = M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE ? 1 : 0;
	tso_mss = is_tso ? mbuf->m_pkthdr.tso_segsz : 0;

	eh = mtod(mbuf, struct ether_header *);
	KASSERT(eh->ether_type != ETHERTYPE_VLAN,
	    ("VLAN-tagged packets not supported"));

	is_ipv6 = ntohs(eh->ether_type) == ETHERTYPE_IPV6;
	l3_off = ETHER_HDR_LEN;
	mbuf_next = m_getptr(mbuf, l3_off, &offset);

	if (is_ipv6) {
		ip6 = (struct ip6_hdr *)(mtodo(mbuf_next, offset));
		l4_off = l3_off + sizeof(struct ip6_hdr);
		is_tcp = (ip6->ip6_nxt == IPPROTO_TCP);
		is_udp = (ip6->ip6_nxt == IPPROTO_UDP);
		mbuf_next = m_getptr(mbuf, l4_off, &offset);
	} else if (ntohs(eh->ether_type) == ETHERTYPE_IP) {
		ip = (struct ip *)(mtodo(mbuf_next, offset));
		l4_off = l3_off + (ip->ip_hl << 2);
		is_tcp = (ip->ip_p == IPPROTO_TCP);
		is_udp = (ip->ip_p == IPPROTO_UDP);
		mbuf_next = m_getptr(mbuf, l4_off, &offset);
	}

	l4_data_off = 0;
	if (is_tcp) {
		th = (struct tcphdr *)(mtodo(mbuf_next, offset));
		l4_data_off = l4_off + (th->th_off << 2);
	} else if (is_udp)
		l4_data_off = l4_off + sizeof(struct udphdr);

	if (has_csum_flag) {
		if ((csum_flags & (CSUM_TSO | CSUM_TCP | CSUM_IP6_TCP)) != 0)
			csum_offset = offsetof(struct tcphdr, th_sum);
		else
			csum_offset = offsetof(struct udphdr, uh_sum);
	}

	/*
	 * If this packet is neither a TCP nor a UDP packet, the first segment,
	 * the one represented by the packet descriptor, will carry the
	 * spec-stipulated minimum of 182B.
	 */
	if (l4_data_off != 0)
		first_seg_len = l4_data_off;
	else
		first_seg_len = MIN(pkt_len, GVE_GQ_TX_MIN_PKT_DESC_BYTES);

	bytes_required = gve_fifo_bytes_required(tx, first_seg_len, pkt_len);
	if (__predict_false(!gve_can_tx(tx, bytes_required))) {
		counter_enter();
		counter_u64_add_protected(tx->stats.tx_delayed_pkt_nospace_device, 1);
		counter_exit();
		return (ENOBUFS);
	}

	/* So that the cleanup taskqueue can free the mbuf eventually. */
	info->mbuf = mbuf;

	/*
	 * We don't want to split the header, so if necessary, pad to the end
	 * of the fifo and then put the header at the beginning of the fifo.
	 */
	pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len);
	hdr_nfrags = gve_tx_alloc_fifo(&tx->fifo, first_seg_len + pad_bytes,
	    &info->iov[0]);
	KASSERT(hdr_nfrags > 0, ("Number of header fragments for gve tx is 0"));
	payload_nfrags = gve_tx_alloc_fifo(&tx->fifo, pkt_len - first_seg_len,
	    &info->iov[payload_iov]);

	pkt_desc = &tx->desc_ring[idx].pkt;
	gve_tx_fill_pkt_desc(pkt_desc, is_tso, l4_off,
	    1 + mtd_desc_nr + payload_nfrags, first_seg_len,
	    info->iov[hdr_nfrags - 1].iov_offset, has_csum_flag, csum_offset,
	    pkt_len);

	m_copydata(mbuf, 0, first_seg_len,
	    (char *)tx->fifo.base + info->iov[hdr_nfrags - 1].iov_offset);
	gve_dma_sync_for_device(tx->com.qpl,
	    info->iov[hdr_nfrags - 1].iov_offset,
	    info->iov[hdr_nfrags - 1].iov_len);
	copy_offset = first_seg_len;

	if (mtd_desc_nr == 1) {
		next_idx = (tx->req + 1) & tx->mask;
		mtd_desc = &tx->desc_ring[next_idx].mtd;
		gve_tx_fill_mtd_desc(mtd_desc, mbuf);
	}

	for (i = payload_iov; i < payload_nfrags + payload_iov; i++) {
		next_idx = (tx->req + 1 + mtd_desc_nr + i - payload_iov) & tx->mask;
		seg_desc = &tx->desc_ring[next_idx].seg;

		gve_tx_fill_seg_desc(seg_desc, is_tso, info->iov[i].iov_len,
		    info->iov[i].iov_offset, is_ipv6, l3_off, tso_mss);

		m_copydata(mbuf, copy_offset, info->iov[i].iov_len,
		    (char *)tx->fifo.base + info->iov[i].iov_offset);
		gve_dma_sync_for_device(tx->com.qpl,
		    info->iov[i].iov_offset, info->iov[i].iov_len);
		copy_offset += info->iov[i].iov_len;
	}

	tx->req += (1 + mtd_desc_nr + payload_nfrags);
	if (is_tso) {
		counter_enter();
		counter_u64_add_protected(tx->stats.tso_packet_cnt, 1);
		counter_exit();
	}
	return (0);
}

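/*
 * Dispatches a packet to the queue-format-specific transmit routine:
 * gve_xmit for GQI, gve_xmit_dqo_qpl when the DQO queues use a queue page
 * list, and gve_xmit_dqo otherwise.
 */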
static int
gve_xmit_mbuf(struct gve_tx_ring *tx,
    struct mbuf **mbuf)
{
	if (gve_is_gqi(tx->com.priv))
		return (gve_xmit(tx, *mbuf));

	if (gve_is_qpl(tx->com.priv))
		return (gve_xmit_dqo_qpl(tx, *mbuf));

	/*
	 * gve_xmit_dqo might attempt to defrag the mbuf chain.
	 * The reference is passed in so that in the case of
	 * errors, the new mbuf chain is what's put back on the br.
	 */
	return (gve_xmit_dqo(tx, mbuf));
}

/*
 * Has the side-effect of stopping the xmit queue by setting tx->stopped
 */
static int
gve_xmit_retry_enobuf_mbuf(struct gve_tx_ring *tx,
    struct mbuf **mbuf)
{
	int err;

	atomic_store_bool(&tx->stopped, true);

	/*
	 * Room made in the queue BEFORE the barrier will be seen by the
	 * gve_xmit_mbuf retry below.
	 *
	 * If room is made in the queue AFTER the barrier, the cleanup tq
	 * iteration creating the room will either see a tx->stopped value
	 * of 0 or the 1 we just wrote:
	 *
	 * If it sees a 1, then it would enqueue the xmit tq. Enqueue
	 * implies a retry on the waiting pkt.
	 *
	 * If it sees a 0, then that implies a previous iteration overwrote
	 * our 1, and that iteration would enqueue the xmit tq. Enqueue
	 * implies a retry on the waiting pkt.
	 */
	atomic_thread_fence_seq_cst();

	err = gve_xmit_mbuf(tx, mbuf);
	if (err == 0)
		atomic_store_bool(&tx->stopped, false);

	return (err);
}

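/*
 * Drains the ring's buf_ring with the ring lock held: each dequeued packet
 * is handed to the hardware-specific xmit routine and, on success, the
 * doorbell is rung so the NIC sees the newly posted descriptors.
 */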
static void
gve_xmit_br(struct gve_tx_ring *tx)
{
	struct gve_priv *priv = tx->com.priv;
	struct ifnet *ifp = priv->ifp;
	struct mbuf *mbuf;
	int err;

	while ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0 &&
	    (mbuf = drbr_peek(ifp, tx->br)) != NULL) {
		err = gve_xmit_mbuf(tx, &mbuf);

		/*
		 * We need to stop this taskqueue when we can't xmit the pkt due
		 * to lack of space in the NIC ring (ENOBUFS). The retry exists
		 * to guard against a TOCTTOU bug that could end up freezing the
		 * queue forever.
		 */
		if (__predict_false(mbuf != NULL && err == ENOBUFS))
			err = gve_xmit_retry_enobuf_mbuf(tx, &mbuf);

		if (__predict_false(err != 0 && mbuf != NULL)) {
			if (err == EINVAL) {
				drbr_advance(ifp, tx->br);
				m_freem(mbuf);
			} else
				drbr_putback(ifp, tx->br, mbuf);
			break;
		}

		drbr_advance(ifp, tx->br);
		BPF_MTAP(ifp, mbuf);

		bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map,
		    BUS_DMASYNC_PREWRITE);

		if (gve_is_gqi(priv))
			gve_db_bar_write_4(priv, tx->com.db_offset, tx->req);
		else
			gve_db_bar_dqo_write_4(priv, tx->com.db_offset,
			    tx->dqo.desc_tail);
	}
}

void
gve_xmit_tq(void *arg, int pending)
{
	struct gve_tx_ring *tx = (struct gve_tx_ring *)arg;

	GVE_RING_LOCK(tx);
	gve_xmit_br(tx);
	GVE_RING_UNLOCK(tx);
}

static bool
is_vlan_tagged_pkt(struct mbuf *mbuf)
{
	struct ether_header *eh;

	eh = mtod(mbuf, struct ether_header *);
	return (ntohs(eh->ether_type) == ETHERTYPE_VLAN);
}

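/*
 * if_transmit entry point: picks a tx queue from the mbuf's flow id (or the
 * current CPU when there is no hash), drops VLAN-tagged packets, enqueues
 * the mbuf on the queue's buf_ring, and either transmits inline when the
 * ring was empty or defers to the xmit taskqueue.
 */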
int
gve_xmit_ifp(if_t ifp, struct mbuf *mbuf)
{
	struct gve_priv *priv = if_getsoftc(ifp);
	struct gve_tx_ring *tx;
	bool is_br_empty;
	int err;
	uint32_t i;

	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
		return (ENODEV);

	if (M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE)
		i = mbuf->m_pkthdr.flowid % priv->tx_cfg.num_queues;
	else
		i = curcpu % priv->tx_cfg.num_queues;
	tx = &priv->tx[i];

	if (__predict_false(is_vlan_tagged_pkt(mbuf))) {
		counter_enter();
		counter_u64_add_protected(tx->stats.tx_dropped_pkt_vlan, 1);
		counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1);
		counter_exit();
		m_freem(mbuf);
		return (ENODEV);
	}

	is_br_empty = drbr_empty(ifp, tx->br);
	err = drbr_enqueue(ifp, tx->br, mbuf);
	if (__predict_false(err != 0)) {
		if (!atomic_load_bool(&tx->stopped))
			taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
		counter_enter();
		counter_u64_add_protected(tx->stats.tx_dropped_pkt_nospace_bufring, 1);
		counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1);
		counter_exit();
		return (err);
	}

	/*
	 * If the mbuf we just enqueued is the only one on the ring, then
	 * transmit it right away in the interests of low latency.
	 */
	if (is_br_empty && (GVE_RING_TRYLOCK(tx) != 0)) {
		gve_xmit_br(tx);
		GVE_RING_UNLOCK(tx);
	} else if (!atomic_load_bool(&tx->stopped))
		taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);

	return (0);
}

void
gve_qflush(if_t ifp)
{
	struct gve_priv *priv = if_getsoftc(ifp);
	struct gve_tx_ring *tx;
	int i;

	for (i = 0; i < priv->tx_cfg.num_queues; ++i) {
		tx = &priv->tx[i];
		if (drbr_empty(ifp, tx->br) == 0) {
			GVE_RING_LOCK(tx);
			drbr_flush(ifp, tx->br);
			GVE_RING_UNLOCK(tx);
		}
	}

	if_qflush(ifp);
}