/*-
 * Copyright (c) 2015 Mellanox Technologies. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include "en.h"
#include <machine/atomic.h>

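/*
 * Return true when a completion (CQE) should be requested for the
 * current WQE. Completions are only requested once every "cev_factor"
 * sends in order to reduce completion event overhead.
 */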
static inline bool
mlx5e_do_send_cqe(struct mlx5e_sq *sq)
{
	sq->cev_counter++;
	/* interleave the CQEs */
	if (sq->cev_counter >= sq->cev_factor) {
		sq->cev_counter = 0;
		return (1);
	}
	return (0);
}

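/*
 * Post a NOP WQE spanning "ds_cnt" data segments at the current
 * producer index. No mbuf is associated with the queue entry; a
 * completion is requested according to mlx5e_do_send_cqe().
 */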
void
mlx5e_send_nop(struct mlx5e_sq *sq, u32 ds_cnt)
{
	u16 pi = sq->pc & sq->wq.sz_m1;
	struct mlx5e_tx_wqe *wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi);

	memset(&wqe->ctrl, 0, sizeof(wqe->ctrl));

	wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_NOP);
	wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt);
	if (mlx5e_do_send_cqe(sq))
		wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
	else
		wqe->ctrl.fm_ce_se = 0;

	/* Copy data for doorbell */
	memcpy(sq->doorbell.d32, &wqe->ctrl, sizeof(sq->doorbell.d32));

	sq->mbuf[pi].mbuf = NULL;
	sq->mbuf[pi].num_bytes = 0;
	sq->mbuf[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS);
	sq->pc += sq->mbuf[pi].num_wqebbs;
}

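/*
 * "mlx5e_hash_value" is the key passed to m_ether_tcpip_hash() by
 * mlx5e_select_queue() when an mbuf arrives without a flow ID.
 */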
#if (__FreeBSD_version >= 1100000)
static uint32_t mlx5e_hash_value;

static void
mlx5e_hash_init(void *arg)
{
	mlx5e_hash_value = m_ether_tcpip_hash_init();
}

/* Have the kernel call mlx5e_hash_init() after the random subsystem has finished initializing */
SYSINIT(mlx5e_hash_init, SI_SUB_RANDOM, SI_ORDER_ANY, &mlx5e_hash_init, NULL);
#endif

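/*
 * Map an outgoing mbuf to a send queue. The traffic class is taken
 * from the VLAN priority bits when present, and the channel is
 * selected from the mbuf flow ID, or from a software hash when no
 * flow ID is available. Returns NULL when the channels are not (yet)
 * allocated.
 */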
static struct mlx5e_sq *
mlx5e_select_queue(struct ifnet *ifp, struct mbuf *mb)
{
	struct mlx5e_priv *priv = ifp->if_softc;
	u32 ch;
	u32 tc;

	/* check if channels are successfully opened */
	if (unlikely(priv->channel == NULL))
		return (NULL);

	/* obtain VLAN information if present */
	if (mb->m_flags & M_VLANTAG) {
		tc = (mb->m_pkthdr.ether_vtag >> 13);
		if (tc >= priv->num_tc)
			tc = priv->default_vlan_prio;
	} else {
		tc = priv->default_vlan_prio;
	}

	ch = priv->params.num_channels;

	/* check if flowid is set */
	if (M_HASHTYPE_GET(mb) != M_HASHTYPE_NONE) {
#ifdef RSS
		u32 temp;

		if (rss_hash2bucket(mb->m_pkthdr.flowid,
		    M_HASHTYPE_GET(mb), &temp) == 0)
			ch = temp % ch;
		else
#endif
			ch = (mb->m_pkthdr.flowid % 128) % ch;
	} else {
#if (__FreeBSD_version >= 1100000)
		ch = m_ether_tcpip_hash(MBUF_HASHFLAG_L3 |
		    MBUF_HASHFLAG_L4, mb, mlx5e_hash_value) % ch;
#else
		/*
		 * m_ether_tcpip_hash() is not present in stable branches,
		 * so just put unhashed mbufs on queue 0
		 */
		ch = 0;
#endif
	}

	/* check if channel is allocated */
	if (unlikely(priv->channel[ch] == NULL))
		return (NULL);

	return (&priv->channel[ch]->sq[tc]);
}

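/*
 * Return the number of header bytes to copy inline into the WQE,
 * limited to MLX5E_MAX_TX_INLINE and to the length of the first mbuf
 * fragment.
 */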
static inline u16
mlx5e_get_inline_hdr_size(struct mlx5e_sq *sq, struct mbuf *mb)
{
	return (MIN(MLX5E_MAX_TX_INLINE, mb->m_len));
}

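/*
 * Compute the total Ethernet + IP/IPv6 + TCP header length of a TSO
 * mbuf. Returns 0 if the packet is not TCP or if the headers are not
 * contiguous in the first mbuf fragment.
 */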
static int
mlx5e_get_header_size(struct mbuf *mb)
{
	struct ether_vlan_header *eh;
	struct tcphdr *th;
	struct ip *ip;
	int ip_hlen, tcp_hlen;
	struct ip6_hdr *ip6;
	uint16_t eth_type;
	int eth_hdr_len;

	eh = mtod(mb, struct ether_vlan_header *);
	if (mb->m_len < ETHER_HDR_LEN)
		return (0);
	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
		eth_type = ntohs(eh->evl_proto);
		eth_hdr_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	} else {
		eth_type = ntohs(eh->evl_encap_proto);
		eth_hdr_len = ETHER_HDR_LEN;
	}
	if (mb->m_len < eth_hdr_len)
		return (0);
	switch (eth_type) {
	case ETHERTYPE_IP:
		ip = (struct ip *)(mb->m_data + eth_hdr_len);
		if (mb->m_len < eth_hdr_len + sizeof(*ip))
			return (0);
		if (ip->ip_p != IPPROTO_TCP)
			return (0);
		ip_hlen = ip->ip_hl << 2;
		eth_hdr_len += ip_hlen;
		break;
	case ETHERTYPE_IPV6:
		ip6 = (struct ip6_hdr *)(mb->m_data + eth_hdr_len);
		if (mb->m_len < eth_hdr_len + sizeof(*ip6))
			return (0);
		if (ip6->ip6_nxt != IPPROTO_TCP)
			return (0);
		eth_hdr_len += sizeof(*ip6);
		break;
	default:
		return (0);
	}
	if (mb->m_len < eth_hdr_len + sizeof(*th))
		return (0);
	th = (struct tcphdr *)(mb->m_data + eth_hdr_len);
	tcp_hlen = th->th_off << 2;
	eth_hdr_len += tcp_hlen;
	if (mb->m_len < eth_hdr_len)
		return (0);
	return (eth_hdr_len);
}

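/*
 * Build and post a single send WQE for the given mbuf: pad the queue
 * to the ring edge if needed, set up checksum/TSO offload flags,
 * inline the packet headers (re-inserting the VLAN tag when the mbuf
 * is tagged), DMA-map the remaining payload into data segments, and
 * record the doorbell value for the caller to ring.
 */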
/*
 * The return value is not passed back to the network stack; the drbr
 * already owns the mbuf, so mlx5e_xmit_locked() only uses it to decide
 * whether to put the mbuf back on the ring or to advance past it.
 */
static int
mlx5e_sq_xmit(struct mlx5e_sq *sq, struct mbuf **mbp)
{
	bus_dma_segment_t segs[MLX5E_MAX_TX_MBUF_FRAGS];
	struct mlx5_wqe_data_seg *dseg;
	struct mlx5e_tx_wqe *wqe;
	struct ifnet *ifp;
	int nsegs;
	int err;
	int x;
	struct mbuf *mb = *mbp;
	u16 ds_cnt;
	u16 ihs;
	u16 pi;
	u8 opcode;

	/*
	 * Return ENOBUFS if the queue is full; this may trigger reinsertion
	 * of the mbuf into the drbr (see mlx5e_xmit_locked)
	 */
	if (unlikely(!mlx5e_sq_has_room_for(sq, 2 * MLX5_SEND_WQE_MAX_WQEBBS))) {
		return (ENOBUFS);
	}

	/*
	 * Align SQ edge with NOPs to avoid WQE wrap around:
	 * "pi" is the number of WQEBBs left before the end of the ring,
	 * not counting the current slot. If a maximum-sized WQE would not
	 * fit, pad the remainder of the ring with one multi-WQEBB NOP so
	 * that the next WQE starts at the beginning of the ring.
	 */
	pi = ((~sq->pc) & sq->wq.sz_m1);
	if (pi < (MLX5_SEND_WQE_MAX_WQEBBS - 1)) {
		/* Send one multi NOP message instead of many */
		mlx5e_send_nop(sq, (pi + 1) * MLX5_SEND_WQEBB_NUM_DS);
		pi = ((~sq->pc) & sq->wq.sz_m1);
		if (pi < (MLX5_SEND_WQE_MAX_WQEBBS - 1))
			return (ENOMEM);
	}

	/* Setup local variables */
	pi = sq->pc & sq->wq.sz_m1;
	wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi);
	ifp = sq->ifp;

	memset(wqe, 0, sizeof(*wqe));

	/* Send a copy of the frame to the BPF listener, if any */
	if (ifp != NULL && ifp->if_bpf != NULL)
		ETHER_BPF_MTAP(ifp, mb);

	if (mb->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO)) {
		wqe->eth.cs_flags |= MLX5_ETH_WQE_L3_CSUM;
	}
	if (mb->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) {
		wqe->eth.cs_flags |= MLX5_ETH_WQE_L4_CSUM;
	}
	if (wqe->eth.cs_flags == 0) {
		sq->stats.csum_offload_none++;
	}
	if (mb->m_pkthdr.csum_flags & CSUM_TSO) {
		u32 payload_len;
		u32 mss = mb->m_pkthdr.tso_segsz;
		u32 num_pkts;

		wqe->eth.mss = cpu_to_be16(mss);
		opcode = MLX5_OPCODE_LSO;
		ihs = mlx5e_get_header_size(mb);
		payload_len = mb->m_pkthdr.len - ihs;
		if (payload_len == 0)
			num_pkts = 1;
		else
			num_pkts = DIV_ROUND_UP(payload_len, mss);
		sq->mbuf[pi].num_bytes = payload_len + (num_pkts * ihs);

		sq->stats.tso_packets++;
		sq->stats.tso_bytes += payload_len;
	} else {
		opcode = MLX5_OPCODE_SEND;
		ihs = mlx5e_get_inline_hdr_size(sq, mb);
		sq->mbuf[pi].num_bytes = max_t(unsigned int,
		    mb->m_pkthdr.len, ETHER_MIN_LEN - ETHER_CRC_LEN);
	}
	if (mb->m_flags & M_VLANTAG) {
		struct ether_vlan_header *eh =
		    (struct ether_vlan_header *)wqe->eth.inline_hdr_start;

		/* Range checks */
		if (ihs > (MLX5E_MAX_TX_INLINE - ETHER_VLAN_ENCAP_LEN))
			ihs = (MLX5E_MAX_TX_INLINE - ETHER_VLAN_ENCAP_LEN);
		else if (ihs < ETHER_HDR_LEN) {
			err = EINVAL;
			goto tx_drop;
		}
		m_copydata(mb, 0, ETHER_HDR_LEN, (caddr_t)eh);
		m_adj(mb, ETHER_HDR_LEN);
		/* Insert a 4-byte VLAN tag into the data stream */
		eh->evl_proto = eh->evl_encap_proto;
		eh->evl_encap_proto = htons(ETHERTYPE_VLAN);
		eh->evl_tag = htons(mb->m_pkthdr.ether_vtag);
		/* Copy rest of header data, if any */
		m_copydata(mb, 0, ihs - ETHER_HDR_LEN, (caddr_t)(eh + 1));
		m_adj(mb, ihs - ETHER_HDR_LEN);
		/* Extend the inline header by 4 bytes */
		ihs += ETHER_VLAN_ENCAP_LEN;
	} else {
		m_copydata(mb, 0, ihs, wqe->eth.inline_hdr_start);
		m_adj(mb, ihs);
	}

	wqe->eth.inline_hdr_sz = cpu_to_be16(ihs);

	ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS;
	if (likely(ihs > sizeof(wqe->eth.inline_hdr_start))) {
		ds_cnt += DIV_ROUND_UP(ihs - sizeof(wqe->eth.inline_hdr_start),
		    MLX5_SEND_WQE_DS);
	}
	dseg = ((struct mlx5_wqe_data_seg *)&wqe->ctrl) + ds_cnt;

	/* Trim off empty mbufs */
	while (mb->m_len == 0) {
		mb = m_free(mb);
		/* Check if all data has been inlined */
		if (mb == NULL)
			goto skip_dma;
	}

	err = bus_dmamap_load_mbuf_sg(sq->dma_tag, sq->mbuf[pi].dma_map,
	    mb, segs, &nsegs, BUS_DMA_NOWAIT);
	if (err == EFBIG) {
		/*
		 * Update *mbp before defrag in case it was trimmed in the
		 * loop above
		 */
		*mbp = mb;
		/* Update statistics */
		sq->stats.defragged++;
		/* Too many mbuf fragments */
		mb = m_defrag(*mbp, M_NOWAIT);
		if (mb == NULL) {
			mb = *mbp;
			goto tx_drop;
		}
		/* Try again */
		err = bus_dmamap_load_mbuf_sg(sq->dma_tag, sq->mbuf[pi].dma_map,
		    mb, segs, &nsegs, BUS_DMA_NOWAIT);
	}
	/* Catch errors */
	if (err != 0)
		goto tx_drop;

	for (x = 0; x != nsegs; x++) {
		if (segs[x].ds_len == 0)
			continue;
		dseg->addr = cpu_to_be64((uint64_t)segs[x].ds_addr);
		dseg->lkey = sq->mkey_be;
		dseg->byte_count = cpu_to_be32((uint32_t)segs[x].ds_len);
		dseg++;
	}
skip_dma:
	ds_cnt = (dseg - ((struct mlx5_wqe_data_seg *)&wqe->ctrl));

	wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | opcode);
	wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt);
	if (mlx5e_do_send_cqe(sq))
		wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
	else
		wqe->ctrl.fm_ce_se = 0;

	/* Copy data for doorbell */
	memcpy(sq->doorbell.d32, &wqe->ctrl, sizeof(sq->doorbell.d32));

	/* Store pointer to mbuf */
	sq->mbuf[pi].mbuf = mb;
	sq->mbuf[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS);
	sq->pc += sq->mbuf[pi].num_wqebbs;

	/* Make sure all mbuf data is written to RAM */
	if (mb != NULL)
		bus_dmamap_sync(sq->dma_tag, sq->mbuf[pi].dma_map, BUS_DMASYNC_PREWRITE);

	sq->stats.packets++;
	*mbp = NULL;	/* safety clear */
	return (0);

tx_drop:
	sq->stats.dropped++;
	*mbp = NULL;
	m_freem(mb);
	return (err);
}

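/*
 * Reclaim completed transmit WQEs. Each CQE accounts for "cev_factor"
 * WQEs; the corresponding mbufs are unmapped and freed and the
 * consumer counter is advanced. If the send queue was marked full,
 * the transmit task is re-queued.
 */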
static void
mlx5e_poll_tx_cq(struct mlx5e_sq *sq, int budget)
{
	u16 sqcc;

	/*
	 * sq->cc must be updated only after mlx5_cqwq_update_db_record();
	 * otherwise a cq overrun may occur
	 */
	sqcc = sq->cc;

	while (budget > 0) {
		struct mlx5_cqe64 *cqe;
		struct mbuf *mb;
		u16 x;
		u16 ci;

		cqe = mlx5e_get_cqe(&sq->cq);
		if (!cqe)
			break;

		mlx5_cqwq_pop(&sq->cq.wq);

		/* update budget according to the event factor */
		budget -= sq->cev_factor;

		for (x = 0; x != sq->cev_factor; x++) {
			ci = sqcc & sq->wq.sz_m1;
			mb = sq->mbuf[ci].mbuf;
			sq->mbuf[ci].mbuf = NULL;	/* Safety clear */

			if (mb == NULL) {
				if (sq->mbuf[ci].num_bytes == 0) {
					/* NOP */
					sq->stats.nop++;
				}
			} else {
				bus_dmamap_sync(sq->dma_tag, sq->mbuf[ci].dma_map,
				    BUS_DMASYNC_POSTWRITE);
				bus_dmamap_unload(sq->dma_tag, sq->mbuf[ci].dma_map);

				/* Free transmitted mbuf */
				m_freem(mb);
			}
			sqcc += sq->mbuf[ci].num_wqebbs;
		}
	}

	mlx5_cqwq_update_db_record(&sq->cq.wq);

	/* Ensure cq space is freed before enabling more cqes */
	wmb();

	sq->cc = sqcc;

	if (atomic_cmpset_int(&sq->queue_state, MLX5E_SQ_FULL, MLX5E_SQ_READY))
		taskqueue_enqueue(sq->sq_tq, &sq->sq_task);
}

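/*
 * Transmit path with the send queue lock held: enqueue the new mbuf
 * on the buffered ring (drbr), drain as many packets as possible into
 * the hardware queue, ring the doorbell once for the whole batch and
 * start the completion event timer when needed.
 */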
static int
mlx5e_xmit_locked(struct ifnet *ifp, struct mlx5e_sq *sq, struct mbuf *mb)
{
	struct mbuf *next;
	int err = 0;

	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
		if (mb)
			err = drbr_enqueue(ifp, sq->br, mb);
		return (err);
	}

	if (mb != NULL)
		/*
		 * If we can't insert the mbuf into the drbr, try to xmit
		 * anyway. Keep the error so it can be returned after the
		 * xmit.
		 */
		err = drbr_enqueue(ifp, sq->br, mb);

	/* Process the queue */
	while ((next = drbr_peek(ifp, sq->br)) != NULL) {
		if (mlx5e_sq_xmit(sq, &next) != 0) {
			if (next == NULL) {
				drbr_advance(ifp, sq->br);
			} else {
				drbr_putback(ifp, sq->br, next);
				atomic_store_rel_int(&sq->queue_state, MLX5E_SQ_FULL);
			}
			break;
		}
		drbr_advance(ifp, sq->br);
		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
			break;
	}
	/* Check if we need to write the doorbell */
	if (likely(sq->doorbell.d64 != 0)) {
		mlx5e_tx_notify_hw(sq, sq->doorbell.d32, 0);
		sq->doorbell.d64 = 0;
	}
	/*
	 * Check if we need to start the event timer which flushes the
	 * transmit ring on timeout:
	 */
	if (unlikely(sq->cev_next_state == MLX5E_CEV_STATE_INITIAL &&
	    sq->cev_factor != 1)) {
		/* start the timer */
		mlx5e_sq_cev_timeout(sq);
	} else {
		/* don't send NOPs yet */
		sq->cev_next_state = MLX5E_CEV_STATE_HOLD_NOPS;
	}
	return (err);
}

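/*
 * Network stack transmit entry point (if_transmit): select a send
 * queue for the mbuf and transmit directly when the queue lock is
 * uncontended; otherwise enqueue the mbuf on the drbr and defer the
 * work to the send queue task.
 */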
int
mlx5e_xmit(struct ifnet *ifp, struct mbuf *mb)
{
	struct mlx5e_sq *sq;
	int ret;

	sq = mlx5e_select_queue(ifp, mb);
	if (unlikely(sq == NULL)) {
		/* Invalid send queue */
		m_freem(mb);
		return (ENXIO);
	}
	if (mtx_trylock(&sq->lock)) {
		ret = mlx5e_xmit_locked(ifp, sq, mb);
		mtx_unlock(&sq->lock);
	} else {
		ret = drbr_enqueue(ifp, sq->br, mb);
		taskqueue_enqueue(sq->sq_tq, &sq->sq_task);
	}

	return (ret);
}

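/*
 * Completion event handler for the transmit completion queue: poll
 * for finished WQEs and re-arm the CQ for the next event.
 */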
void
mlx5e_tx_cq_comp(struct mlx5_core_cq *mcq)
{
	struct mlx5e_sq *sq = container_of(mcq, struct mlx5e_sq, cq.mcq);

	mtx_lock(&sq->comp_lock);
	mlx5e_poll_tx_cq(sq, MLX5E_BUDGET_MAX);
	mlx5e_cq_arm(&sq->cq);
	mtx_unlock(&sq->comp_lock);
}

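/*
 * Deferred transmit task: flush any mbufs buffered on the drbr while
 * the interface is running.
 */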
void
mlx5e_tx_que(void *context, int pending)
{
	struct mlx5e_sq *sq = context;
	struct ifnet *ifp = sq->ifp;

	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
		mtx_lock(&sq->lock);
		if (!drbr_empty(ifp, sq->br))
			mlx5e_xmit_locked(ifp, sq, NULL);
		mtx_unlock(&sq->lock);
	}
}
543