/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 2023-2024 Google LLC
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 *    may be used to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include "gve.h"
#include "gve_adminq.h"
#include "gve_dqo.h"

#define GVE_GQ_TX_MIN_PKT_DESC_BYTES 182

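/*
 * GQI TX rings copy packet bytes into a FIFO carved out of the ring's queue
 * page list (QPL), which is mapped contiguously at qpl->kva; the descriptors
 * posted to the device then refer to packet data by its offset in this FIFO.
 */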
static int
gve_tx_fifo_init(struct gve_priv *priv, struct gve_tx_ring *tx)
{
	struct gve_queue_page_list *qpl = tx->com.qpl;
	struct gve_tx_fifo *fifo = &tx->fifo;

	fifo->size = qpl->num_pages * PAGE_SIZE;
	fifo->base = qpl->kva;
	atomic_store_int(&fifo->available, fifo->size);
	fifo->head = 0;

	return (0);
}

static void
gve_tx_free_ring_gqi(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];

	if (tx->desc_ring != NULL) {
		gve_dma_free_coherent(&tx->desc_ring_mem);
		tx->desc_ring = NULL;
	}

	if (tx->info != NULL) {
		free(tx->info, M_GVE);
		tx->info = NULL;
	}
}

static void
gve_tx_free_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;

	/* Safe to call even if never allocated */
	gve_free_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS);

	if (mtx_initialized(&tx->ring_mtx))
		mtx_destroy(&tx->ring_mtx);

	if (com->q_resources != NULL) {
		gve_dma_free_coherent(&com->q_resources_mem);
		com->q_resources = NULL;
	}

	if (tx->br != NULL) {
		buf_ring_free(tx->br, M_DEVBUF);
		tx->br = NULL;
	}

	if (gve_is_gqi(priv))
		gve_tx_free_ring_gqi(priv, i);
	else
		gve_tx_free_ring_dqo(priv, i);
}

static int
gve_tx_alloc_ring_gqi(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;
	int err;

	err = gve_dma_alloc_coherent(priv,
	    sizeof(union gve_tx_desc) * priv->tx_desc_cnt,
	    CACHE_LINE_SIZE, &tx->desc_ring_mem);
	if (err != 0) {
		device_printf(priv->dev,
		    "Failed to alloc desc ring for tx ring %d\n", i);
		goto abort;
	}
	tx->desc_ring = tx->desc_ring_mem.cpu_addr;

	com->qpl = &priv->qpls[i];
	if (com->qpl == NULL) {
		device_printf(priv->dev, "No QPL left for tx ring %d\n", i);
		err = ENOMEM;
		goto abort;
	}

	err = gve_tx_fifo_init(priv, tx);
	if (err != 0)
		goto abort;

	tx->info = malloc(
	    sizeof(struct gve_tx_buffer_state) * priv->tx_desc_cnt,
	    M_GVE, M_WAITOK | M_ZERO);
	return (0);

abort:
	gve_tx_free_ring_gqi(priv, i);
	return (err);
}

static int
gve_tx_alloc_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;
	char mtx_name[16];
	int err;

	com->priv = priv;
	com->id = i;

	if (gve_is_gqi(priv))
		err = gve_tx_alloc_ring_gqi(priv, i);
	else
		err = gve_tx_alloc_ring_dqo(priv, i);
	if (err != 0)
		goto abort;

	sprintf(mtx_name, "gvetx%d", i);
	mtx_init(&tx->ring_mtx, mtx_name, NULL, MTX_DEF);

	tx->br = buf_ring_alloc(GVE_TX_BUFRING_ENTRIES, M_DEVBUF,
	    M_WAITOK, &tx->ring_mtx);

	gve_alloc_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS);

	err = gve_dma_alloc_coherent(priv, sizeof(struct gve_queue_resources),
	    PAGE_SIZE, &com->q_resources_mem);
	if (err != 0) {
		device_printf(priv->dev,
		    "Failed to alloc queue resources for tx ring %d\n", i);
		goto abort;
	}
	com->q_resources = com->q_resources_mem.cpu_addr;

	return (0);

abort:
	gve_tx_free_ring(priv, i);
	return (err);
}

int
gve_alloc_tx_rings(struct gve_priv *priv)
{
	int err = 0;
	int i;

	priv->tx = malloc(sizeof(struct gve_tx_ring) * priv->tx_cfg.num_queues,
	    M_GVE, M_WAITOK | M_ZERO);

	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
		err = gve_tx_alloc_ring(priv, i);
		if (err != 0)
			goto free_rings;
	}

	return (0);

free_rings:
	while (i--)
		gve_tx_free_ring(priv, i);
	free(priv->tx, M_GVE);
	return (err);
}

void
gve_free_tx_rings(struct gve_priv *priv)
{
	int i;

	for (i = 0; i < priv->tx_cfg.num_queues; i++)
		gve_tx_free_ring(priv, i);

	free(priv->tx, M_GVE);
}

static void
gve_tx_clear_desc_ring(struct gve_tx_ring *tx)
{
	struct gve_ring_com *com = &tx->com;
	int i;

	for (i = 0; i < com->priv->tx_desc_cnt; i++) {
		tx->desc_ring[i] = (union gve_tx_desc){};
		tx->info[i] = (struct gve_tx_buffer_state){};
	}

	bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map,
	    BUS_DMASYNC_PREWRITE);
}

static void
gve_clear_tx_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_tx_fifo *fifo = &tx->fifo;

	tx->req = 0;
	tx->done = 0;
	tx->mask = priv->tx_desc_cnt - 1;

	atomic_store_int(&fifo->available, fifo->size);
	fifo->head = 0;

	gve_tx_clear_desc_ring(tx);
}

static void
gve_start_tx_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;

	atomic_store_bool(&tx->stopped, false);
	if (gve_is_gqi(priv))
		NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq, tx);
	else
		NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq_dqo, tx);
	com->cleanup_tq = taskqueue_create_fast("gve tx", M_WAITOK,
	    taskqueue_thread_enqueue, &com->cleanup_tq);
	taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET, "%s txq %d",
	    device_get_nameunit(priv->dev), i);

	TASK_INIT(&tx->xmit_task, 0, gve_xmit_tq, tx);
	tx->xmit_tq = taskqueue_create_fast("gve tx xmit",
	    M_WAITOK, taskqueue_thread_enqueue, &tx->xmit_tq);
	taskqueue_start_threads(&tx->xmit_tq, 1, PI_NET, "%s txq %d xmit",
	    device_get_nameunit(priv->dev), i);
}

int
gve_create_tx_rings(struct gve_priv *priv)
{
	struct gve_ring_com *com;
	struct gve_tx_ring *tx;
	int err;
	int i;

	if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK))
		return (0);

	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
		if (gve_is_gqi(priv))
			gve_clear_tx_ring(priv, i);
		else
			gve_clear_tx_ring_dqo(priv, i);
	}

	err = gve_adminq_create_tx_queues(priv, priv->tx_cfg.num_queues);
	if (err != 0)
		return (err);

	bus_dmamap_sync(priv->irqs_db_mem.tag, priv->irqs_db_mem.map,
	    BUS_DMASYNC_POSTREAD);

	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
		tx = &priv->tx[i];
		com = &tx->com;

		com->irq_db_offset = 4 * be32toh(priv->irq_db_indices[com->ntfy_id].index);

		bus_dmamap_sync(com->q_resources_mem.tag, com->q_resources_mem.map,
		    BUS_DMASYNC_POSTREAD);
		com->db_offset = 4 * be32toh(com->q_resources->db_index);
		com->counter_idx = be32toh(com->q_resources->counter_index);

		gve_start_tx_ring(priv, i);
	}

	gve_set_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK);
	return (0);
}

static void
gve_stop_tx_ring(struct gve_priv *priv, int i)
{
	struct gve_tx_ring *tx = &priv->tx[i];
	struct gve_ring_com *com = &tx->com;

	if (com->cleanup_tq != NULL) {
		taskqueue_quiesce(com->cleanup_tq);
		taskqueue_free(com->cleanup_tq);
		com->cleanup_tq = NULL;
	}

	if (tx->xmit_tq != NULL) {
		taskqueue_quiesce(tx->xmit_tq);
		taskqueue_free(tx->xmit_tq);
		tx->xmit_tq = NULL;
	}
}

int
gve_destroy_tx_rings(struct gve_priv *priv)
{
	int err;
	int i;

	for (i = 0; i < priv->tx_cfg.num_queues; i++)
		gve_stop_tx_ring(priv, i);

	if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK)) {
		err = gve_adminq_destroy_tx_queues(priv, priv->tx_cfg.num_queues);
		if (err != 0)
			return (err);
		gve_clear_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK);
	}

	return (0);
}

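/*
 * Interrupt filter for a TX ring: mask further interrupts for the ring and
 * hand completion processing off to the ring's cleanup taskqueue.
 */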
int
gve_tx_intr(void *arg)
{
	struct gve_tx_ring *tx = arg;
	struct gve_priv *priv = tx->com.priv;
	struct gve_ring_com *com = &tx->com;

	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
		return (FILTER_STRAY);

	gve_db_bar_write_4(priv, com->irq_db_offset, GVE_IRQ_MASK);
	taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task);
	return (FILTER_HANDLED);
}

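/*
 * The device reports TX completions by advancing a per-ring event counter in
 * the shared counter array; the number of newly completed descriptors is the
 * difference between this value and tx->done.
 */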
static uint32_t
gve_tx_load_event_counter(struct gve_priv *priv, struct gve_tx_ring *tx)
{
	bus_dmamap_sync(priv->counter_array_mem.tag, priv->counter_array_mem.map,
	    BUS_DMASYNC_POSTREAD);
	uint32_t counter = priv->counters[tx->com.counter_idx];
	return (be32toh(counter));
}

static void
gve_tx_free_fifo(struct gve_tx_fifo *fifo, size_t bytes)
{
	atomic_add_int(&fifo->available, bytes);
}

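/*
 * Completion taskqueue handler: reclaim descriptors completed by the device,
 * free their mbufs and FIFO space, re-arm the interrupt, and restart the xmit
 * taskqueue if it had stopped for lack of space.
 */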
void
gve_tx_cleanup_tq(void *arg, int pending)
{
	struct gve_tx_ring *tx = arg;
	struct gve_priv *priv = tx->com.priv;
	uint32_t nic_done = gve_tx_load_event_counter(priv, tx);
	uint32_t todo = nic_done - tx->done;
	size_t space_freed = 0;
	int i, j;

	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
		return;

	for (j = 0; j < todo; j++) {
		uint32_t idx = tx->done & tx->mask;
		struct gve_tx_buffer_state *info = &tx->info[idx];
		struct mbuf *mbuf = info->mbuf;

		tx->done++;
		if (mbuf == NULL)
			continue;

		info->mbuf = NULL;
		counter_enter();
		counter_u64_add_protected(tx->stats.tbytes, mbuf->m_pkthdr.len);
		counter_u64_add_protected(tx->stats.tpackets, 1);
		counter_exit();
		m_freem(mbuf);

		for (i = 0; i < GVE_TX_MAX_DESCS; i++) {
			space_freed += info->iov[i].iov_len + info->iov[i].iov_padding;
			info->iov[i].iov_len = 0;
			info->iov[i].iov_padding = 0;
		}
	}

	gve_tx_free_fifo(&tx->fifo, space_freed);

	gve_db_bar_write_4(priv, tx->com.irq_db_offset,
	    GVE_IRQ_ACK | GVE_IRQ_EVENT);

	/*
	 * Completions born before this barrier MAY NOT cause the NIC to send an
	 * interrupt, but they will still be handled by the enqueue below.
	 * Completions born after the barrier WILL trigger an interrupt.
	 */
	atomic_thread_fence_seq_cst();

	nic_done = gve_tx_load_event_counter(priv, tx);
	todo = nic_done - tx->done;
	if (todo != 0) {
		gve_db_bar_write_4(priv, tx->com.irq_db_offset, GVE_IRQ_MASK);
		taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task);
	}

	if (atomic_load_bool(&tx->stopped) && space_freed) {
		atomic_store_bool(&tx->stopped, false);
		taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
	}
}

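/*
 * Sync every QPL page overlapped by [iov_offset, iov_offset + iov_len) so
 * that the device observes the bytes just copied into the FIFO.
 */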
static void
gve_dma_sync_for_device(struct gve_queue_page_list *qpl,
    uint64_t iov_offset, uint64_t iov_len)
{
	uint64_t last_page = (iov_offset + iov_len - 1) / PAGE_SIZE;
	uint64_t first_page = iov_offset / PAGE_SIZE;
	struct gve_dma_handle *dma;
	uint64_t page;

	for (page = first_page; page <= last_page; page++) {
		dma = &(qpl->dmas[page]);
		bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREWRITE);
	}
}

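/*
 * The optional metadata descriptor passes the mbuf's L4 flow hash to the
 * device (GVE_MTD_SUBTYPE_PATH) as a path-selection hint.
 */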
static void
gve_tx_fill_mtd_desc(struct gve_tx_mtd_desc *mtd_desc, struct mbuf *mbuf)
{
	mtd_desc->type_flags = GVE_TXD_MTD | GVE_MTD_SUBTYPE_PATH;
	mtd_desc->path_state = GVE_MTD_PATH_STATE_DEFAULT | GVE_MTD_PATH_HASH_L4;
	mtd_desc->path_hash = htobe32(mbuf->m_pkthdr.flowid);
	mtd_desc->reserved0 = 0;
	mtd_desc->reserved1 = 0;
}

static void
gve_tx_fill_pkt_desc(struct gve_tx_pkt_desc *pkt_desc, bool is_tso,
    uint16_t l4_hdr_offset, uint32_t desc_cnt,
    uint16_t first_seg_len, uint64_t addr, bool has_csum_flag,
    int csum_offset, uint16_t pkt_len)
{
	if (is_tso) {
		pkt_desc->type_flags = GVE_TXD_TSO | GVE_TXF_L4CSUM;
		pkt_desc->l4_csum_offset = csum_offset >> 1;
		pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1;
	} else if (has_csum_flag) {
		pkt_desc->type_flags = GVE_TXD_STD | GVE_TXF_L4CSUM;
		pkt_desc->l4_csum_offset = csum_offset >> 1;
		pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1;
	} else {
		pkt_desc->type_flags = GVE_TXD_STD;
		pkt_desc->l4_csum_offset = 0;
		pkt_desc->l4_hdr_offset = 0;
	}
	pkt_desc->desc_cnt = desc_cnt;
	pkt_desc->len = htobe16(pkt_len);
	pkt_desc->seg_len = htobe16(first_seg_len);
	pkt_desc->seg_addr = htobe64(addr);
}

static void
gve_tx_fill_seg_desc(struct gve_tx_seg_desc *seg_desc,
    bool is_tso, uint16_t len, uint64_t addr,
    bool is_ipv6, uint8_t l3_off, uint16_t tso_mss)
{
	seg_desc->type_flags = GVE_TXD_SEG;
	if (is_tso) {
		if (is_ipv6)
			seg_desc->type_flags |= GVE_TXSF_IPV6;
		seg_desc->l3_offset = l3_off >> 1;
		seg_desc->mss = htobe16(tso_mss);
	}
	seg_desc->seg_len = htobe16(len);
	seg_desc->seg_addr = htobe64(addr);
}

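/*
 * Number of descriptor slots still available to the driver: the ring size
 * minus the descriptors posted (tx->req) but not yet completed (tx->done).
 */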
static inline uint32_t
gve_tx_avail(struct gve_tx_ring *tx)
{
	return (tx->mask + 1 - (tx->req - tx->done));
}

static bool
gve_tx_fifo_can_alloc(struct gve_tx_fifo *fifo, size_t bytes)
{
	return (atomic_load_int(&fifo->available) >= bytes);
}

static inline bool
gve_can_tx(struct gve_tx_ring *tx, int bytes_required)
{
	return (gve_tx_avail(tx) >= (GVE_TX_MAX_DESCS + 1) &&
	    gve_tx_fifo_can_alloc(&tx->fifo, bytes_required));
}

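/*
 * Returns 0 if an allocation of 'bytes' fits before the end of the FIFO;
 * otherwise returns the number of pad bytes needed to skip ahead to the start
 * of the FIFO so that the allocation stays contiguous.
 */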
static int
gve_tx_fifo_pad_alloc_one_frag(struct gve_tx_fifo *fifo, size_t bytes)
{
	return (fifo->head + bytes < fifo->size) ? 0 : fifo->size - fifo->head;
}

static inline int
gve_fifo_bytes_required(struct gve_tx_ring *tx, uint16_t first_seg_len,
    uint16_t pkt_len)
{
	int pad_bytes, align_hdr_pad;
	int bytes;

	pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len);
	/* We need to take into account the header alignment padding. */
	align_hdr_pad = roundup2(first_seg_len, CACHE_LINE_SIZE) - first_seg_len;
	bytes = align_hdr_pad + pad_bytes + pkt_len;

	return (bytes);
}

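/*
 * Carves 'bytes' out of the FIFO, splitting the allocation across the wrap
 * point into at most two iovecs, and pads the tail out to a cacheline
 * boundary. Returns the number of iovecs used (0 when bytes == 0).
 */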
static int
gve_tx_alloc_fifo(struct gve_tx_fifo *fifo, size_t bytes,
    struct gve_tx_iovec iov[2])
{
	size_t overflow, padding;
	uint32_t aligned_head;
	int nfrags = 0;

	if (bytes == 0)
		return (0);

	/*
	 * This check happens before we know how much padding is needed to
	 * align to a cacheline boundary for the payload, but that is fine,
	 * because the FIFO head always starts aligned, and the FIFO's boundaries
	 * are aligned, so if there is space for the data, there is space for
	 * the padding to the next alignment.
	 */
	KASSERT(gve_tx_fifo_can_alloc(fifo, bytes),
	    ("Allocating gve tx fifo when there is no room"));

	nfrags++;

	iov[0].iov_offset = fifo->head;
	iov[0].iov_len = bytes;
	fifo->head += bytes;

	if (fifo->head > fifo->size) {
		/*
		 * If the allocation did not fit in the tail fragment of the
		 * FIFO, also use the head fragment.
		 */
		nfrags++;
		overflow = fifo->head - fifo->size;
		iov[0].iov_len -= overflow;
		iov[1].iov_offset = 0;	/* Start of FIFO */
		iov[1].iov_len = overflow;

		fifo->head = overflow;
	}

	/* Re-align to a cacheline boundary */
	aligned_head = roundup2(fifo->head, CACHE_LINE_SIZE);
	padding = aligned_head - fifo->head;
	iov[nfrags - 1].iov_padding = padding;
	atomic_add_int(&fifo->available, -(bytes + padding));
	fifo->head = aligned_head;

	if (fifo->head == fifo->size)
		fifo->head = 0;

	return (nfrags);
}

/*
 * The only error this returns is ENOBUFS, raised when the descriptor ring or
 * the TX FIFO is short of space.
 */
static int
gve_xmit(struct gve_tx_ring *tx, struct mbuf *mbuf)
{
	bool is_tso, has_csum_flag, is_ipv6 = false, is_tcp = false, is_udp = false;
	int csum_flags, csum_offset, mtd_desc_nr, offset, copy_offset;
	uint16_t tso_mss, l4_off, l4_data_off, pkt_len, first_seg_len;
	int pad_bytes, hdr_nfrags, payload_nfrags;
	struct gve_tx_pkt_desc *pkt_desc;
	struct gve_tx_seg_desc *seg_desc;
	struct gve_tx_mtd_desc *mtd_desc;
	struct gve_tx_buffer_state *info;
	uint32_t idx = tx->req & tx->mask;
	struct ether_header *eh;
	struct mbuf *mbuf_next;
	int payload_iov = 2;
	int bytes_required;
	struct ip6_hdr *ip6;
	struct tcphdr *th;
	uint32_t next_idx;
	uint8_t l3_off;
	struct ip *ip;
	int i;

	info = &tx->info[idx];
	csum_flags = mbuf->m_pkthdr.csum_flags;
	pkt_len = mbuf->m_pkthdr.len;
	is_tso = csum_flags & CSUM_TSO;
	has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP |
	    CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO);
	mtd_desc_nr = M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE ? 1 : 0;
	tso_mss = is_tso ? mbuf->m_pkthdr.tso_segsz : 0;

	eh = mtod(mbuf, struct ether_header *);
	KASSERT(ntohs(eh->ether_type) != ETHERTYPE_VLAN,
	    ("VLAN-tagged packets not supported"));

	is_ipv6 = ntohs(eh->ether_type) == ETHERTYPE_IPV6;
	l3_off = ETHER_HDR_LEN;
	mbuf_next = m_getptr(mbuf, l3_off, &offset);

	if (is_ipv6) {
		ip6 = (struct ip6_hdr *)(mtodo(mbuf_next, offset));
		l4_off = l3_off + sizeof(struct ip6_hdr);
		is_tcp = (ip6->ip6_nxt == IPPROTO_TCP);
		is_udp = (ip6->ip6_nxt == IPPROTO_UDP);
		mbuf_next = m_getptr(mbuf, l4_off, &offset);
	} else if (ntohs(eh->ether_type) == ETHERTYPE_IP) {
		ip = (struct ip *)(mtodo(mbuf_next, offset));
		l4_off = l3_off + (ip->ip_hl << 2);
		is_tcp = (ip->ip_p == IPPROTO_TCP);
		is_udp = (ip->ip_p == IPPROTO_UDP);
		mbuf_next = m_getptr(mbuf, l4_off, &offset);
	}

	l4_data_off = 0;
	if (is_tcp) {
		th = (struct tcphdr *)(mtodo(mbuf_next, offset));
		l4_data_off = l4_off + (th->th_off << 2);
	} else if (is_udp)
		l4_data_off = l4_off + sizeof(struct udphdr);

	if (has_csum_flag) {
		if ((csum_flags & (CSUM_TSO | CSUM_TCP | CSUM_IP6_TCP)) != 0)
			csum_offset = offsetof(struct tcphdr, th_sum);
		else
			csum_offset = offsetof(struct udphdr, uh_sum);
	}

	/*
	 * If this packet is neither a TCP nor a UDP packet, the first segment,
	 * the one represented by the packet descriptor, will carry the
	 * spec-stipulated minimum of 182B.
	 */
	if (l4_data_off != 0)
		first_seg_len = l4_data_off;
	else
		first_seg_len = MIN(pkt_len, GVE_GQ_TX_MIN_PKT_DESC_BYTES);

	bytes_required = gve_fifo_bytes_required(tx, first_seg_len, pkt_len);
	if (__predict_false(!gve_can_tx(tx, bytes_required))) {
		counter_enter();
		counter_u64_add_protected(tx->stats.tx_delayed_pkt_nospace_device, 1);
		counter_exit();
		return (ENOBUFS);
	}

	/* So that the cleanup taskqueue can free the mbuf eventually. */
	info->mbuf = mbuf;

	/*
	 * We don't want to split the header, so if necessary, pad to the end
	 * of the fifo and then put the header at the beginning of the fifo.
	 */
	pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len);
	hdr_nfrags = gve_tx_alloc_fifo(&tx->fifo, first_seg_len + pad_bytes,
	    &info->iov[0]);
	KASSERT(hdr_nfrags > 0, ("Number of header fragments for gve tx is 0"));
	payload_nfrags = gve_tx_alloc_fifo(&tx->fifo, pkt_len - first_seg_len,
	    &info->iov[payload_iov]);

	pkt_desc = &tx->desc_ring[idx].pkt;
	gve_tx_fill_pkt_desc(pkt_desc, is_tso, l4_off,
	    1 + mtd_desc_nr + payload_nfrags, first_seg_len,
	    info->iov[hdr_nfrags - 1].iov_offset, has_csum_flag, csum_offset,
	    pkt_len);

	m_copydata(mbuf, 0, first_seg_len,
	    (char *)tx->fifo.base + info->iov[hdr_nfrags - 1].iov_offset);
	gve_dma_sync_for_device(tx->com.qpl,
	    info->iov[hdr_nfrags - 1].iov_offset,
	    info->iov[hdr_nfrags - 1].iov_len);
	copy_offset = first_seg_len;

	if (mtd_desc_nr == 1) {
		next_idx = (tx->req + 1) & tx->mask;
		mtd_desc = &tx->desc_ring[next_idx].mtd;
		gve_tx_fill_mtd_desc(mtd_desc, mbuf);
	}

	for (i = payload_iov; i < payload_nfrags + payload_iov; i++) {
		next_idx = (tx->req + 1 + mtd_desc_nr + i - payload_iov) & tx->mask;
		seg_desc = &tx->desc_ring[next_idx].seg;

		gve_tx_fill_seg_desc(seg_desc, is_tso, info->iov[i].iov_len,
		    info->iov[i].iov_offset, is_ipv6, l3_off, tso_mss);

		m_copydata(mbuf, copy_offset, info->iov[i].iov_len,
		    (char *)tx->fifo.base + info->iov[i].iov_offset);
		gve_dma_sync_for_device(tx->com.qpl,
		    info->iov[i].iov_offset, info->iov[i].iov_len);
		copy_offset += info->iov[i].iov_len;
	}

	tx->req += (1 + mtd_desc_nr + payload_nfrags);
	if (is_tso) {
		counter_enter();
		counter_u64_add_protected(tx->stats.tso_packet_cnt, 1);
		counter_exit();
	}
	return (0);
}

static int
gve_xmit_mbuf(struct gve_tx_ring *tx,
    struct mbuf **mbuf)
{
	if (gve_is_gqi(tx->com.priv))
		return (gve_xmit(tx, *mbuf));

	if (gve_is_qpl(tx->com.priv))
		return (gve_xmit_dqo_qpl(tx, *mbuf));

	/*
	 * gve_xmit_dqo might attempt to defrag the mbuf chain.
	 * The reference is passed in so that in the case of
	 * errors, the new mbuf chain is what's put back on the br.
	 */
	return (gve_xmit_dqo(tx, mbuf));
}

/*
 * Has the side-effect of stopping the xmit queue by setting tx->stopped
 */
static int
gve_xmit_retry_enobuf_mbuf(struct gve_tx_ring *tx,
    struct mbuf **mbuf)
{
	int err;

	atomic_store_bool(&tx->stopped, true);

	/*
	 * Room made in the queue BEFORE the barrier will be seen by the
	 * gve_xmit_mbuf retry below.
	 *
	 * If room is made in the queue AFTER the barrier, the cleanup tq
	 * iteration creating the room will either see a tx->stopped value
	 * of 0 or the 1 we just wrote:
	 *
	 * If it sees a 1, then it would enqueue the xmit tq. Enqueue
	 * implies a retry on the waiting pkt.
	 *
	 * If it sees a 0, then that implies a previous iteration overwrote
	 * our 1, and that iteration would enqueue the xmit tq. Enqueue
	 * implies a retry on the waiting pkt.
	 */
	atomic_thread_fence_seq_cst();

	err = gve_xmit_mbuf(tx, mbuf);
	if (err == 0)
		atomic_store_bool(&tx->stopped, false);

	return (err);
}

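/*
 * Drains the ring's buf_ring: dequeues packets, hands them to the
 * device-format-specific xmit routine, and rings the doorbell. Runs with the
 * ring lock held, either inline from gve_xmit_ifp() or from the xmit
 * taskqueue.
 */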
static void
gve_xmit_br(struct gve_tx_ring *tx)
{
	struct gve_priv *priv = tx->com.priv;
	struct ifnet *ifp = priv->ifp;
	struct mbuf *mbuf;
	int err;

	while ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0 &&
	    (mbuf = drbr_peek(ifp, tx->br)) != NULL) {
		err = gve_xmit_mbuf(tx, &mbuf);

		/*
		 * We need to stop this taskqueue when we can't xmit the pkt due
		 * to lack of space in the NIC ring (ENOBUFS). The retry exists
		 * to guard against a TOCTTOU bug that could end up freezing the
		 * queue forever.
		 */
		if (__predict_false(mbuf != NULL && err == ENOBUFS))
			err = gve_xmit_retry_enobuf_mbuf(tx, &mbuf);

		if (__predict_false(err != 0 && mbuf != NULL)) {
			if (err == EINVAL) {
				drbr_advance(ifp, tx->br);
				m_freem(mbuf);
			} else
				drbr_putback(ifp, tx->br, mbuf);
			break;
		}

		drbr_advance(ifp, tx->br);
		BPF_MTAP(ifp, mbuf);

		bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map,
		    BUS_DMASYNC_PREWRITE);

		if (gve_is_gqi(priv))
			gve_db_bar_write_4(priv, tx->com.db_offset, tx->req);
		else
			gve_db_bar_dqo_write_4(priv, tx->com.db_offset,
			    tx->dqo.desc_tail);
	}
}

void
gve_xmit_tq(void *arg, int pending)
{
	struct gve_tx_ring *tx = (struct gve_tx_ring *)arg;

	GVE_RING_LOCK(tx);
	gve_xmit_br(tx);
	GVE_RING_UNLOCK(tx);
}

static bool
is_vlan_tagged_pkt(struct mbuf *mbuf)
{
	struct ether_header *eh;

	eh = mtod(mbuf, struct ether_header *);
	return (ntohs(eh->ether_type) == ETHERTYPE_VLAN);
}

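/*
 * if_transmit handler: select a ring from the mbuf's flow id (or the current
 * CPU), drop VLAN-tagged frames, enqueue onto the ring's buf_ring, and either
 * transmit inline when the ring was idle or defer to the xmit taskqueue.
 */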
int
gve_xmit_ifp(if_t ifp, struct mbuf *mbuf)
{
	struct gve_priv *priv = if_getsoftc(ifp);
	struct gve_tx_ring *tx;
	bool is_br_empty;
	int err;
	uint32_t i;

	if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
		return (ENODEV);

	if (M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE)
		i = mbuf->m_pkthdr.flowid % priv->tx_cfg.num_queues;
	else
		i = curcpu % priv->tx_cfg.num_queues;
	tx = &priv->tx[i];

	if (__predict_false(is_vlan_tagged_pkt(mbuf))) {
		counter_enter();
		counter_u64_add_protected(tx->stats.tx_dropped_pkt_vlan, 1);
		counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1);
		counter_exit();
		m_freem(mbuf);
		return (ENODEV);
	}

	is_br_empty = drbr_empty(ifp, tx->br);
	err = drbr_enqueue(ifp, tx->br, mbuf);
	if (__predict_false(err != 0)) {
		if (!atomic_load_bool(&tx->stopped))
			taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
		counter_enter();
		counter_u64_add_protected(tx->stats.tx_dropped_pkt_nospace_bufring, 1);
		counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1);
		counter_exit();
		return (err);
	}

	/*
	 * If the mbuf we just enqueued is the only one on the ring, then
	 * transmit it right away in the interests of low latency.
	 */
	if (is_br_empty && (GVE_RING_TRYLOCK(tx) != 0)) {
		gve_xmit_br(tx);
		GVE_RING_UNLOCK(tx);
	} else if (!atomic_load_bool(&tx->stopped))
		taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);

	return (0);
}

void
gve_qflush(if_t ifp)
{
	struct gve_priv *priv = if_getsoftc(ifp);
	struct gve_tx_ring *tx;
	int i;

	for (i = 0; i < priv->tx_cfg.num_queues; ++i) {
		tx = &priv->tx[i];
		if (drbr_empty(ifp, tx->br) == 0) {
			GVE_RING_LOCK(tx);
			drbr_flush(ifp, tx->br);
			GVE_RING_UNLOCK(tx);
		}
	}

	if_qflush(ifp);
}