xref: /freebsd/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c (revision ee5cf11617a9b7f034d95c639bd4d27d1f09e848)
1 /*-
2  * Copyright (c) 2015 Mellanox Technologies. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  * $FreeBSD$
26  */
27 
28 #include "en.h"
29 #include <machine/atomic.h>
30 
31 static inline bool
32 mlx5e_do_send_cqe(struct mlx5e_sq *sq)
33 {
34 	sq->cev_counter++;
35 	/* interleave the CQEs */
36 	if (sq->cev_counter >= sq->cev_factor) {
37 		sq->cev_counter = 0;
38 		return (1);
39 	}
40 	return (0);
41 }
42 
43 void
44 mlx5e_send_nop(struct mlx5e_sq *sq, u32 ds_cnt)
45 {
46 	u16 pi = sq->pc & sq->wq.sz_m1;
47 	struct mlx5e_tx_wqe *wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi);
48 
49 	memset(&wqe->ctrl, 0, sizeof(wqe->ctrl));
50 
51 	wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_NOP);
52 	wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt);
53 	if (mlx5e_do_send_cqe(sq))
54 		wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
55 	else
56 		wqe->ctrl.fm_ce_se = 0;
57 
58 	/* Copy data for doorbell */
59 	memcpy(sq->doorbell.d32, &wqe->ctrl, sizeof(sq->doorbell.d32));
60 
61 	sq->mbuf[pi].mbuf = NULL;
62 	sq->mbuf[pi].num_bytes = 0;
63 	sq->mbuf[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS);
64 	sq->pc += sq->mbuf[pi].num_wqebbs;
65 }
66 
#if (__FreeBSD_version >= 1100000)
/*
 * Seed passed to m_ether_tcpip_hash() when a transmit queue has to be
 * selected for an mbuf that carries no hash type.
 */
static uint32_t mlx5e_hash_value;

/* SYSINIT callback: obtain the hash seed, "arg" is not used */
static void
mlx5e_hash_init(void *arg)
{
	mlx5e_hash_value = m_ether_tcpip_hash_init();
}

/*
 * Register mlx5e_hash_init() at SI_SUB_RANDOM so that the kernel calls
 * it after the random stack has finished initializing.
 */
SYSINIT(mlx5e_hash_init, SI_SUB_RANDOM, SI_ORDER_ANY, &mlx5e_hash_init, NULL);
#endif
79 
80 static struct mlx5e_sq *
81 mlx5e_select_queue(struct ifnet *ifp, struct mbuf *mb)
82 {
83 	struct mlx5e_priv *priv = ifp->if_softc;
84 	u32 ch;
85 	u32 tc;
86 
87 	/* check if channels are successfully opened */
88 	if (unlikely(priv->channel == NULL))
89 		return (NULL);
90 
91 	/* obtain VLAN information if present */
92 	if (mb->m_flags & M_VLANTAG) {
93 		tc = (mb->m_pkthdr.ether_vtag >> 13);
94 		if (tc >= priv->num_tc)
95 			tc = priv->default_vlan_prio;
96 	} else {
97 		tc = priv->default_vlan_prio;
98 	}
99 
100 	ch = priv->params.num_channels;
101 
102 	/* check if flowid is set */
103 	if (M_HASHTYPE_GET(mb) != M_HASHTYPE_NONE) {
104 #ifdef RSS
105 		u32 temp;
106 
107 		if (rss_hash2bucket(mb->m_pkthdr.flowid,
108 		    M_HASHTYPE_GET(mb), &temp) == 0)
109 			ch = temp % ch;
110 		else
111 #endif
112 			ch = (mb->m_pkthdr.flowid % 128) % ch;
113 	} else {
114 #if (__FreeBSD_version >= 1100000)
115 		ch = m_ether_tcpip_hash(MBUF_HASHFLAG_L3 |
116 		    MBUF_HASHFLAG_L4, mb, mlx5e_hash_value) % ch;
117 #else
118 		/*
119 		 * m_ether_tcpip_hash not present in stable, so just
120 		 * throw unhashed mbufs on queue 0
121 		 */
122 		ch = 0;
123 #endif
124 	}
125 
126 	/* check if channel is allocated */
127 	if (unlikely(priv->channel[ch] == NULL))
128 		return (NULL);
129 
130 	return (&priv->channel[ch]->sq[tc]);
131 }
132 
133 static inline u16
134 mlx5e_get_inline_hdr_size(struct mlx5e_sq *sq, struct mbuf *mb)
135 {
136 	return (MIN(MLX5E_MAX_TX_INLINE, mb->m_len));
137 }
138 
139 static int
140 mlx5e_get_header_size(struct mbuf *mb)
141 {
142 	struct ether_vlan_header *eh;
143 	struct tcphdr *th;
144 	struct ip *ip;
145 	int ip_hlen, tcp_hlen;
146 	struct ip6_hdr *ip6;
147 	uint16_t eth_type;
148 	int eth_hdr_len;
149 
150 	eh = mtod(mb, struct ether_vlan_header *);
151 	if (mb->m_len < ETHER_HDR_LEN)
152 		return (0);
153 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
154 		eth_type = ntohs(eh->evl_proto);
155 		eth_hdr_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
156 	} else {
157 		eth_type = ntohs(eh->evl_encap_proto);
158 		eth_hdr_len = ETHER_HDR_LEN;
159 	}
160 	if (mb->m_len < eth_hdr_len)
161 		return (0);
162 	switch (eth_type) {
163 	case ETHERTYPE_IP:
164 		ip = (struct ip *)(mb->m_data + eth_hdr_len);
165 		if (mb->m_len < eth_hdr_len + sizeof(*ip))
166 			return (0);
167 		if (ip->ip_p != IPPROTO_TCP)
168 			return (0);
169 		ip_hlen = ip->ip_hl << 2;
170 		eth_hdr_len += ip_hlen;
171 		break;
172 	case ETHERTYPE_IPV6:
173 		ip6 = (struct ip6_hdr *)(mb->m_data + eth_hdr_len);
174 		if (mb->m_len < eth_hdr_len + sizeof(*ip6))
175 			return (0);
176 		if (ip6->ip6_nxt != IPPROTO_TCP)
177 			return (0);
178 		eth_hdr_len += sizeof(*ip6);
179 		break;
180 	default:
181 		return (0);
182 	}
183 	if (mb->m_len < eth_hdr_len + sizeof(*th))
184 		return (0);
185 	th = (struct tcphdr *)(mb->m_data + eth_hdr_len);
186 	tcp_hlen = th->th_off << 2;
187 	eth_hdr_len += tcp_hlen;
188 	if (mb->m_len < eth_hdr_len)
189 		return (0);
190 	return (eth_hdr_len);
191 }
192 
193 /*
194  * The return value is not going back to the stack because of
195  * the drbr
196  */
197 static int
198 mlx5e_sq_xmit(struct mlx5e_sq *sq, struct mbuf **mbp)
199 {
200 	bus_dma_segment_t segs[MLX5E_MAX_TX_MBUF_FRAGS];
201 	struct mlx5_wqe_data_seg *dseg;
202 	struct mlx5e_tx_wqe *wqe;
203 	struct ifnet *ifp;
204 	int nsegs;
205 	int err;
206 	int x;
207 	struct mbuf *mb = *mbp;
208 	u16 ds_cnt;
209 	u16 ihs;
210 	u16 pi;
211 	u8 opcode;
212 
213 	/*
214 	 * Return ENOBUFS if the queue is full, this may trigger reinsertion
215 	 * of the mbuf into the drbr (see mlx5e_xmit_locked)
216 	 */
217 	if (unlikely(!mlx5e_sq_has_room_for(sq, 2 * MLX5_SEND_WQE_MAX_WQEBBS))) {
218 		return (ENOBUFS);
219 	}
220 
221 	/* Align SQ edge with NOPs to avoid WQE wrap around */
222 	pi = ((~sq->pc) & sq->wq.sz_m1);
223 	if (pi < (MLX5_SEND_WQE_MAX_WQEBBS - 1)) {
224 		/* Send one multi NOP message instead of many */
225 		mlx5e_send_nop(sq, (pi + 1) * MLX5_SEND_WQEBB_NUM_DS);
226 		pi = ((~sq->pc) & sq->wq.sz_m1);
227 		if (pi < (MLX5_SEND_WQE_MAX_WQEBBS - 1)) {
228 			m_freem(mb);
229 			return (ENOMEM);
230 		}
231 	}
232 
233 	/* Setup local variables */
234 	pi = sq->pc & sq->wq.sz_m1;
235 	wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi);
236 	ifp = sq->channel->ifp;
237 
238 	memset(wqe, 0, sizeof(*wqe));
239 
240 	/* Send a copy of the frame to the BPF listener, if any */
241 	if (ifp != NULL && ifp->if_bpf != NULL)
242 		ETHER_BPF_MTAP(ifp, mb);
243 
244 	if (mb->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO)) {
245 		wqe->eth.cs_flags |= MLX5_ETH_WQE_L3_CSUM;
246 	}
247 	if (mb->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) {
248 		wqe->eth.cs_flags |= MLX5_ETH_WQE_L4_CSUM;
249 	}
250 	if (wqe->eth.cs_flags == 0) {
251 		sq->stats.csum_offload_none++;
252 	}
253 	if (mb->m_pkthdr.csum_flags & CSUM_TSO) {
254 		u32 payload_len;
255 		u32 mss = mb->m_pkthdr.tso_segsz;
256 		u32 num_pkts;
257 
258 		wqe->eth.mss = cpu_to_be16(mss);
259 		opcode = MLX5_OPCODE_LSO;
260 		ihs = mlx5e_get_header_size(mb);
261 		payload_len = mb->m_pkthdr.len - ihs;
262 		if (payload_len == 0)
263 			num_pkts = 1;
264 		else
265 			num_pkts = DIV_ROUND_UP(payload_len, mss);
266 		sq->mbuf[pi].num_bytes = payload_len + (num_pkts * ihs);
267 
268 		sq->stats.tso_packets++;
269 		sq->stats.tso_bytes += payload_len;
270 	} else {
271 		opcode = MLX5_OPCODE_SEND;
272 		ihs = mlx5e_get_inline_hdr_size(sq, mb);
273 		sq->mbuf[pi].num_bytes = max_t (unsigned int,
274 		    mb->m_pkthdr.len, ETHER_MIN_LEN - ETHER_CRC_LEN);
275 	}
276 	if (mb->m_flags & M_VLANTAG) {
277 		struct ether_vlan_header *eh =
278 		    (struct ether_vlan_header *)wqe->eth.inline_hdr_start;
279 
280 		/* Range checks */
281 		if (ihs > (MLX5E_MAX_TX_INLINE - ETHER_VLAN_ENCAP_LEN))
282 			ihs = (MLX5E_MAX_TX_INLINE - ETHER_VLAN_ENCAP_LEN);
283 		else if (ihs < ETHER_HDR_LEN) {
284 			err = EINVAL;
285 			goto tx_drop;
286 		}
287 		m_copydata(mb, 0, ETHER_HDR_LEN, (caddr_t)eh);
288 		m_adj(mb, ETHER_HDR_LEN);
289 		/* Insert 4 bytes VLAN tag into data stream */
290 		eh->evl_proto = eh->evl_encap_proto;
291 		eh->evl_encap_proto = htons(ETHERTYPE_VLAN);
292 		eh->evl_tag = htons(mb->m_pkthdr.ether_vtag);
293 		/* Copy rest of header data, if any */
294 		m_copydata(mb, 0, ihs - ETHER_HDR_LEN, (caddr_t)(eh + 1));
295 		m_adj(mb, ihs - ETHER_HDR_LEN);
296 		/* Extend header by 4 bytes */
297 		ihs += ETHER_VLAN_ENCAP_LEN;
298 	} else {
299 		m_copydata(mb, 0, ihs, wqe->eth.inline_hdr_start);
300 		m_adj(mb, ihs);
301 	}
302 
303 	wqe->eth.inline_hdr_sz = cpu_to_be16(ihs);
304 
305 	ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS;
306 	if (likely(ihs > sizeof(wqe->eth.inline_hdr_start))) {
307 		ds_cnt += DIV_ROUND_UP(ihs - sizeof(wqe->eth.inline_hdr_start),
308 		    MLX5_SEND_WQE_DS);
309 	}
310 	dseg = ((struct mlx5_wqe_data_seg *)&wqe->ctrl) + ds_cnt;
311 
312 	/* Trim off empty mbufs */
313 	while (mb->m_len == 0) {
314 		mb = m_free(mb);
315 		/* Check if all data has been inlined */
316 		if (mb == NULL)
317 			goto skip_dma;
318 	}
319 
320 	err = bus_dmamap_load_mbuf_sg(sq->dma_tag, sq->mbuf[pi].dma_map,
321 	    mb, segs, &nsegs, BUS_DMA_NOWAIT);
322 	if (err == EFBIG) {
323 		/*
324 		 * Update *mbp before defrag in case it was trimmed in the
325 		 * loop above
326 		 */
327 		*mbp = mb;
328 		/* Update statistics */
329 		sq->stats.defragged++;
330 		/* Too many mbuf fragments */
331 		mb = m_defrag(*mbp, M_NOWAIT);
332 		if (mb == NULL) {
333 			mb = *mbp;
334 			goto tx_drop;
335 		}
336 		/* Try again */
337 		err = bus_dmamap_load_mbuf_sg(sq->dma_tag, sq->mbuf[pi].dma_map,
338 		    mb, segs, &nsegs, BUS_DMA_NOWAIT);
339 	}
340 	/* Catch errors */
341 	if (err != 0) {
342 		goto tx_drop;
343 	}
344 	*mbp = mb;
345 
346 	for (x = 0; x != nsegs; x++) {
347 		if (segs[x].ds_len == 0)
348 			continue;
349 		dseg->addr = cpu_to_be64((uint64_t)segs[x].ds_addr);
350 		dseg->lkey = sq->mkey_be;
351 		dseg->byte_count = cpu_to_be32((uint32_t)segs[x].ds_len);
352 		dseg++;
353 	}
354 skip_dma:
355 	ds_cnt = (dseg - ((struct mlx5_wqe_data_seg *)&wqe->ctrl));
356 
357 	wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | opcode);
358 	wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt);
359 	if (mlx5e_do_send_cqe(sq))
360 		wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
361 	else
362 		wqe->ctrl.fm_ce_se = 0;
363 
364 	/* Copy data for doorbell */
365 	memcpy(sq->doorbell.d32, &wqe->ctrl, sizeof(sq->doorbell.d32));
366 
367 	/* Store pointer to mbuf */
368 	sq->mbuf[pi].mbuf = mb;
369 	sq->mbuf[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS);
370 	sq->pc += sq->mbuf[pi].num_wqebbs;
371 
372 	/* Make sure all mbuf data is written to RAM */
373 	if (mb != NULL)
374 		bus_dmamap_sync(sq->dma_tag, sq->mbuf[pi].dma_map, BUS_DMASYNC_PREWRITE);
375 
376 	sq->stats.packets++;
377 	return (0);
378 
379 tx_drop:
380 	sq->stats.dropped++;
381 	*mbp = NULL;
382 	m_freem(mb);
383 	return err;
384 }
385 
386 static void
387 mlx5e_poll_tx_cq(struct mlx5e_sq *sq, int budget)
388 {
389 	u16 sqcc;
390 
391 	/*
392 	 * sq->cc must be updated only after mlx5_cqwq_update_db_record(),
393 	 * otherwise a cq overrun may occur
394 	 */
395 	sqcc = sq->cc;
396 
397 	while (budget > 0) {
398 		struct mlx5_cqe64 *cqe;
399 		struct mbuf *mb;
400 		u16 x;
401 		u16 ci;
402 
403 		cqe = mlx5e_get_cqe(&sq->cq);
404 		if (!cqe)
405 			break;
406 
407 		mlx5_cqwq_pop(&sq->cq.wq);
408 
409 		/* update budget according to the event factor */
410 		budget -= sq->cev_factor;
411 
412 		for (x = 0; x != sq->cev_factor; x++) {
413 			ci = sqcc & sq->wq.sz_m1;
414 			mb = sq->mbuf[ci].mbuf;
415 			sq->mbuf[ci].mbuf = NULL;	/* Safety clear */
416 
417 			if (mb == NULL) {
418 				if (sq->mbuf[ci].num_bytes == 0) {
419 					/* NOP */
420 					sq->stats.nop++;
421 				}
422 			} else {
423 				bus_dmamap_sync(sq->dma_tag, sq->mbuf[ci].dma_map,
424 				    BUS_DMASYNC_POSTWRITE);
425 				bus_dmamap_unload(sq->dma_tag, sq->mbuf[ci].dma_map);
426 
427 				/* Free transmitted mbuf */
428 				m_freem(mb);
429 			}
430 			sqcc += sq->mbuf[ci].num_wqebbs;
431 		}
432 	}
433 
434 	mlx5_cqwq_update_db_record(&sq->cq.wq);
435 
436 	/* Ensure cq space is freed before enabling more cqes */
437 	wmb();
438 
439 	sq->cc = sqcc;
440 
441 	if (atomic_cmpset_int(&sq->queue_state, MLX5E_SQ_FULL, MLX5E_SQ_READY))
442 		taskqueue_enqueue(sq->sq_tq, &sq->sq_task);
443 }
444 
445 static int
446 mlx5e_xmit_locked(struct ifnet *ifp, struct mlx5e_sq *sq, struct mbuf *mb)
447 {
448 	struct mbuf *next;
449 	int err = 0;
450 
451 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
452 		if (mb)
453 			err = drbr_enqueue(ifp, sq->br, mb);
454 		return (err);
455 	}
456 
457 	if (mb != NULL)
458 		/*
459 		 * If we can't insert mbuf into drbr, try to xmit anyway.
460 		 * We keep the error we got so we could return that after xmit.
461 		 */
462 		err = drbr_enqueue(ifp, sq->br, mb);
463 
464 	/* Process the queue */
465 	while ((next = drbr_peek(ifp, sq->br)) != NULL) {
466 		if (mlx5e_sq_xmit(sq, &next) != 0) {
467 			if (next == NULL) {
468 				drbr_advance(ifp, sq->br);
469 			} else {
470 				drbr_putback(ifp, sq->br, next);
471 				atomic_store_rel_int(&sq->queue_state, MLX5E_SQ_FULL);
472 			}
473 			break;
474 		}
475 		drbr_advance(ifp, sq->br);
476 		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
477 			break;
478 	}
479 	/* Check if we need to write the doorbell */
480 	if (likely(sq->doorbell.d64 != 0)) {
481 		mlx5e_tx_notify_hw(sq, sq->doorbell.d32, 0);
482 		sq->doorbell.d64 = 0;
483 	}
484 	/*
485 	 * Check if we need to start the event timer which flushes the
486 	 * transmit ring on timeout:
487 	 */
488 	if (unlikely(sq->cev_next_state == MLX5E_CEV_STATE_INITIAL &&
489 	    sq->cev_factor != 1)) {
490 		/* start the timer */
491 		mlx5e_sq_cev_timeout(sq);
492 	} else {
493 		/* don't send NOPs yet */
494 		sq->cev_next_state = MLX5E_CEV_STATE_HOLD_NOPS;
495 	}
496 	return (err);
497 }
498 
499 int
500 mlx5e_xmit(struct ifnet *ifp, struct mbuf *mb)
501 {
502 	struct mlx5e_sq *sq;
503 	int ret;
504 
505 	sq = mlx5e_select_queue(ifp, mb);
506 	if (unlikely(sq == NULL)) {
507 		/* Invalid send queue */
508 		m_freem(mb);
509 		return (ENXIO);
510 	}
511 	if (mtx_trylock(&sq->lock)) {
512 		ret = mlx5e_xmit_locked(ifp, sq, mb);
513 		mtx_unlock(&sq->lock);
514 	} else {
515 		ret = drbr_enqueue(ifp, sq->br, mb);
516 		taskqueue_enqueue(sq->sq_tq, &sq->sq_task);
517 	}
518 
519 	return (ret);
520 }
521 
522 void
523 mlx5e_tx_cq_comp(struct mlx5_core_cq *mcq)
524 {
525 	struct mlx5e_sq *sq = container_of(mcq, struct mlx5e_sq, cq.mcq);
526 
527 	mtx_lock(&sq->comp_lock);
528 	mlx5e_poll_tx_cq(sq, MLX5E_BUDGET_MAX);
529 	mlx5e_cq_arm(&sq->cq);
530 	mtx_unlock(&sq->comp_lock);
531 }
532 
533 void
534 mlx5e_tx_que(void *context, int pending)
535 {
536 	struct mlx5e_sq *sq = context;
537 	struct ifnet *ifp = sq->channel->ifp;
538 
539 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
540 		mtx_lock(&sq->lock);
541 		if (!drbr_empty(ifp, sq->br))
542 			mlx5e_xmit_locked(ifp, sq, NULL);
543 		mtx_unlock(&sq->lock);
544 	}
545 }
546