xref: /linux/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c (revision 2c7e63d702f6c4209c5af833308e7fcbc7d4ab17)
1 /* SPDX-License-Identifier: GPL-2.0 */
2 /* Copyright (c) 2019 Mellanox Technologies. */
3 
4 #include <net/netdev_lock.h>
5 
6 #include "health.h"
7 #include "en/ptp.h"
8 #include "en/devlink.h"
9 #include "lib/tout.h"
10 
/* Keep this string array consistent with the MLX5E_SQ_STATE_* enums in en.h */
/* Names emitted as devlink fmsg pairs in mlx5e_health_sq_put_sw_state(). */
static const char * const sq_sw_state_type_name[] = {
	[MLX5E_SQ_STATE_ENABLED] = "enabled",
	[MLX5E_SQ_STATE_MPWQE] = "mpwqe",
	[MLX5E_SQ_STATE_RECOVERING] = "recovering",
	[MLX5E_SQ_STATE_IPSEC] = "ipsec",
	[MLX5E_SQ_STATE_DIM] = "dim",
	[MLX5E_SQ_STATE_PENDING_XSK_TX] = "pending_xsk_tx",
	[MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC] = "pending_tls_rx_resync",
	[MLX5E_SQ_STATE_LOCK_NEEDED] = "lock_needed",
};
22 
mlx5e_wait_for_sq_flush(struct mlx5e_txqsq * sq)23 static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq)
24 {
25 	struct mlx5_core_dev *dev = sq->mdev;
26 	unsigned long exp_time;
27 
28 	exp_time = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, FLUSH_ON_ERROR));
29 
30 	while (time_before(jiffies, exp_time)) {
31 		if (sq->cc == sq->pc)
32 			return 0;
33 
34 		msleep(20);
35 	}
36 
37 	netdev_err(sq->netdev,
38 		   "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n",
39 		   sq->sqn, sq->cc, sq->pc);
40 
41 	return -ETIMEDOUT;
42 }
43 
/* Zero both SQ counters so the queue can restart from a clean state
 * after being moved back to ready. Warns (once) if cc != pc, i.e. if
 * WQEs were still outstanding when the reset was requested.
 */
static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq)
{
	WARN_ONCE(sq->cc != sq->pc,
		  "SQ 0x%x: cc (0x%x) != pc (0x%x)\n",
		  sq->sqn, sq->cc, sq->pc);
	sq->cc = 0;
	sq->pc = 0;
}
52 
mlx5e_health_sq_put_sw_state(struct devlink_fmsg * fmsg,struct mlx5e_txqsq * sq)53 static void mlx5e_health_sq_put_sw_state(struct devlink_fmsg *fmsg, struct mlx5e_txqsq *sq)
54 {
55 	int i;
56 
57 	BUILD_BUG_ON_MSG(ARRAY_SIZE(sq_sw_state_type_name) != MLX5E_NUM_SQ_STATES,
58 			 "sq_sw_state_type_name string array must be consistent with MLX5E_SQ_STATE_* enum in en.h");
59 	mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SW State");
60 
61 	for (i = 0; i < ARRAY_SIZE(sq_sw_state_type_name); ++i)
62 		devlink_fmsg_u32_pair_put(fmsg, sq_sw_state_type_name[i],
63 					  test_bit(i, &sq->state));
64 
65 	mlx5e_health_fmsg_named_obj_nest_end(fmsg);
66 }
67 
/* Recovery handler for an SQ that raised an error CQE. If FW confirms
 * the SQ is in error state: stop the TXQ, drain outstanding WQEs, move
 * the SQ back to ready, reset its counters and re-activate it.
 * @ctx points at the affected struct mlx5e_txqsq. Returns 0 on success
 * or when recovery is no longer relevant, negative errno otherwise.
 */
static int mlx5e_tx_reporter_err_cqe_recover(void *ctx)
{
	struct mlx5_core_dev *mdev;
	struct net_device *dev;
	struct mlx5e_txqsq *sq;
	u8 state;
	int err;

	sq = ctx;
	mdev = sq->mdev;
	dev = sq->netdev;

	/* Nothing to do if recovery already completed or was never requested. */
	if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state))
		return 0;

	/* Recovering queues means re-enabling NAPI, which requires the netdev
	 * instance lock. However, SQ closing flows have to wait for work tasks
	 * to finish while also holding the netdev instance lock. So either get
	 * the lock or find that the SQ is no longer enabled and thus this work
	 * is not relevant anymore.
	 */
	while (!netdev_trylock(dev)) {
		if (!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state))
			return 0;
		msleep(20);
	}

	err = mlx5_core_query_sq_state(mdev, sq->sqn, &state);
	if (err) {
		netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n",
			   sq->sqn, err);
		goto out;
	}

	/* Only an SQ confirmed in error state needs the reset flow below. */
	if (state != MLX5_SQC_STATE_ERR)
		goto out;

	/* Stop the stack from queueing new packets on this TXQ. */
	mlx5e_tx_disable_queue(sq->txq);

	err = mlx5e_wait_for_sq_flush(sq);
	if (err)
		goto out;

	/* At this point, no new packets will arrive from the stack as TXQ is
	 * marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all
	 * pending WQEs. SQ can safely reset the SQ.
	 */

	err = mlx5e_health_sq_to_ready(mdev, dev, sq->sqn);
	if (err)
		goto out;

	mlx5e_reset_txqsq_cc_pc(sq);
	sq->stats->recover++;
	clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
	mlx5e_activate_txqsq(sq);

	/* Kick NAPI so processing resumes promptly on the revived queue. */
	if (sq->channel)
		mlx5e_trigger_napi_icosq(sq->channel);
	else
		mlx5e_trigger_napi_sched(sq->cq.napi);

	netdev_unlock(dev);
	return 0;
out:
	clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
	netdev_unlock(dev);
	return err;
}
137 
/* Carries the SQ under recovery and the recovery outcome between
 * mlx5e_reporter_tx_timeout() and mlx5e_tx_reporter_timeout_recover().
 */
struct mlx5e_tx_timeout_ctx {
	struct mlx5e_txqsq *sq;	/* SQ that hit the TX timeout */
	signed int status;	/* 0: this SQ recovered; 1: all channels
				 * reopened; negative errno on failure
				 */
};
142 
/* Recovery handler for a TX timeout. First tries the lightweight EQ
 * recovery; if that fails, falls back to reopening all channels.
 * @ctx points at a struct mlx5e_tx_timeout_ctx; its status field is set
 * to 0 (this SQ recovered), 1 (all channels reopened) or a negative
 * errno when both attempts fail.
 */
static int mlx5e_tx_reporter_timeout_recover(void *ctx)
{
	struct mlx5e_tx_timeout_ctx *to_ctx;
	struct mlx5e_priv *priv;
	struct mlx5_eq_comp *eq;
	struct mlx5e_txqsq *sq;
	int err;

	to_ctx = ctx;
	sq = to_ctx->sq;
	eq = sq->cq.mcq.eq;
	priv = sq->priv;

	/* Recovering the TX queues implies re-enabling NAPI, which requires
	 * the netdev instance lock.
	 * However, channel closing flows have to wait for this work to finish
	 * while holding the same lock. So either get the lock or find that
	 * channels are being closed for other reason and this work is not
	 * relevant anymore.
	 */
	while (!netdev_trylock(sq->netdev)) {
		if (!test_bit(MLX5E_STATE_CHANNELS_ACTIVE, &priv->state))
			return 0;
		msleep(20);
	}

	err = mlx5e_health_channel_eq_recover(sq->netdev, eq, sq->cq.ch_stats);
	if (!err) {
		to_ctx->status = 0; /* this sq recovered */
		goto out;
	}

	/* EQ recovery failed; reopen all channels under the state lock. */
	mutex_lock(&priv->state_lock);
	err = mlx5e_safe_reopen_channels(priv);
	mutex_unlock(&priv->state_lock);
	if (!err) {
		to_ctx->status = 1; /* all channels recovered */
		goto out;
	}

	/* Both recovery attempts failed; leave the SQ disabled. */
	to_ctx->status = err;
	clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
	netdev_err(priv->netdev,
		   "mlx5e_safe_reopen_channels failed recovering from a tx_timeout, err(%d).\n",
		   err);
out:
	netdev_unlock(sq->netdev);
	return err;
}
192 
/* Recovery handler for an unhealthy PTP SQ: close and reopen the PTP
 * channel while all priv channels are deactivated, restoring the
 * carrier state afterwards. @ctx points at the struct mlx5e_ptpsq.
 */
static int mlx5e_tx_reporter_ptpsq_unhealthy_recover(void *ctx)
{
	struct mlx5e_ptpsq *ptpsq = ctx;
	struct mlx5e_channels *chs;
	struct net_device *netdev;
	struct mlx5e_priv *priv;
	int carrier_ok;
	int err;

	/* Nothing to do if recovery already completed or was never requested. */
	if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &ptpsq->txqsq.state))
		return 0;

	priv = ptpsq->txqsq.priv;
	netdev = priv->netdev;

	/* Recovering the PTP SQ means re-enabling NAPI, which requires the
	 * netdev instance lock. However, SQ closing has to wait for this work
	 * task to finish while also holding the same lock. So either get the
	 * lock or find that the SQ is no longer enabled and thus this work is
	 * not relevant anymore.
	 */
	while (!netdev_trylock(netdev)) {
		if (!test_bit(MLX5E_SQ_STATE_ENABLED, &ptpsq->txqsq.state))
			return 0;
		msleep(20);
	}

	mutex_lock(&priv->state_lock);
	chs = &priv->channels;

	/* Take the carrier down while the PTP channel is being swapped. */
	carrier_ok = netif_carrier_ok(netdev);
	netif_carrier_off(netdev);

	mlx5e_deactivate_priv_channels(priv);

	mlx5e_ptp_close(chs->ptp);
	err = mlx5e_ptp_open(priv, &chs->params, chs->c[0]->lag_port, &chs->ptp);

	mlx5e_activate_priv_channels(priv);

	/* return carrier back if needed */
	if (carrier_ok)
		netif_carrier_on(netdev);

	mutex_unlock(&priv->state_lock);
	netdev_unlock(netdev);

	return err;
}
242 
/* state lock cannot be grabbed within this function.
 * It can cause a dead lock or a read-after-free.
 */
static int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx)
{
	/* Dispatch to the per-error recover callback installed by the caller. */
	return err_ctx->recover(err_ctx->ctx);
}
250 
/* devlink health .recover callback: with a context, run that error's
 * specific recovery; without one, recover all channels.
 */
static int mlx5e_tx_reporter_recover(struct devlink_health_reporter *reporter,
				     void *context,
				     struct netlink_ext_ack *extack)
{
	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
	struct mlx5e_err_ctx *err_ctx = context;

	if (!err_ctx)
		return mlx5e_health_recover_channels(priv);

	return mlx5e_tx_reporter_recover_from_ctx(err_ctx);
}
261 
262 static void
mlx5e_tx_reporter_build_diagnose_output_sq_common(struct devlink_fmsg * fmsg,struct mlx5e_txqsq * sq,int tc)263 mlx5e_tx_reporter_build_diagnose_output_sq_common(struct devlink_fmsg *fmsg,
264 						  struct mlx5e_txqsq *sq, int tc)
265 {
266 	bool stopped = netif_xmit_stopped(sq->txq);
267 	u8 state;
268 	int err;
269 
270 	devlink_fmsg_u32_pair_put(fmsg, "tc", tc);
271 	devlink_fmsg_u32_pair_put(fmsg, "txq ix", sq->txq_ix);
272 	devlink_fmsg_u32_pair_put(fmsg, "sqn", sq->sqn);
273 
274 	err = mlx5_core_query_sq_state(sq->mdev, sq->sqn, &state);
275 	if (!err)
276 		devlink_fmsg_u8_pair_put(fmsg, "HW state", state);
277 
278 	devlink_fmsg_bool_pair_put(fmsg, "stopped", stopped);
279 	devlink_fmsg_u32_pair_put(fmsg, "cc", sq->cc);
280 	devlink_fmsg_u32_pair_put(fmsg, "pc", sq->pc);
281 	mlx5e_health_sq_put_sw_state(fmsg, sq);
282 	mlx5e_health_cq_diag_fmsg(&sq->cq, fmsg);
283 	mlx5e_health_eq_diag_fmsg(sq->cq.mcq.eq, fmsg);
284 }
285 
/* Emit one fmsg object describing a regular (per-channel) SQ. */
static void
mlx5e_tx_reporter_build_diagnose_output(struct devlink_fmsg *fmsg,
					struct mlx5e_txqsq *sq, int tc)
{
	devlink_fmsg_obj_nest_start(fmsg);
	devlink_fmsg_u32_pair_put(fmsg, "channel ix", sq->ch_ix);
	mlx5e_tx_reporter_build_diagnose_output_sq_common(fmsg, sq, tc);
	devlink_fmsg_obj_nest_end(fmsg);
}
295 
/* Emit one fmsg object describing a PTP SQ, including the diagnostics
 * of its port timestamping CQ ("Port TS").
 */
static void
mlx5e_tx_reporter_build_diagnose_output_ptpsq(struct devlink_fmsg *fmsg,
					      struct mlx5e_ptpsq *ptpsq, int tc)
{
	devlink_fmsg_obj_nest_start(fmsg);
	devlink_fmsg_string_pair_put(fmsg, "channel", "ptp");
	mlx5e_tx_reporter_build_diagnose_output_sq_common(fmsg, &ptpsq->txqsq, tc);
	mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Port TS");
	mlx5e_health_cq_diag_fmsg(&ptpsq->ts_cq, fmsg);
	mlx5e_health_fmsg_named_obj_nest_end(fmsg);
	devlink_fmsg_obj_nest_end(fmsg);
}
308 
309 static void
mlx5e_tx_reporter_diagnose_generic_txqsq(struct devlink_fmsg * fmsg,struct mlx5e_txqsq * txqsq)310 mlx5e_tx_reporter_diagnose_generic_txqsq(struct devlink_fmsg *fmsg,
311 					 struct mlx5e_txqsq *txqsq)
312 {
313 	bool real_time =  mlx5_is_real_time_sq(txqsq->mdev);
314 	u32 sq_sz = mlx5_wq_cyc_get_size(&txqsq->wq);
315 	u32 sq_stride = MLX5_SEND_WQE_BB;
316 
317 	mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ");
318 	devlink_fmsg_u64_pair_put(fmsg, "stride size", sq_stride);
319 	devlink_fmsg_u32_pair_put(fmsg, "size", sq_sz);
320 	devlink_fmsg_string_pair_put(fmsg, "ts_format", real_time ? "RT" : "FRC");
321 	mlx5e_health_cq_common_diag_fmsg(&txqsq->cq, fmsg);
322 	mlx5e_health_fmsg_named_obj_nest_end(fmsg);
323 }
324 
/* Emit the common configuration of a PTP SQ's port timestamping CQ. */
static void
mlx5e_tx_reporter_diagnose_generic_tx_port_ts(struct devlink_fmsg *fmsg,
					      struct mlx5e_ptpsq *ptpsq)
{
	mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Port TS");
	mlx5e_health_cq_common_diag_fmsg(&ptpsq->ts_cq, fmsg);
	mlx5e_health_fmsg_named_obj_nest_end(fmsg);
}
333 
334 static void
mlx5e_tx_reporter_diagnose_common_config(struct devlink_health_reporter * reporter,struct devlink_fmsg * fmsg)335 mlx5e_tx_reporter_diagnose_common_config(struct devlink_health_reporter *reporter,
336 					 struct devlink_fmsg *fmsg)
337 {
338 	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
339 	struct mlx5e_txqsq *generic_sq = priv->txq2sq[0];
340 	struct mlx5e_ptp *ptp_ch = priv->channels.ptp;
341 	struct mlx5e_ptpsq *generic_ptpsq;
342 
343 	mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Common Config");
344 	mlx5e_tx_reporter_diagnose_generic_txqsq(fmsg, generic_sq);
345 
346 	if (!ptp_ch || !test_bit(MLX5E_PTP_STATE_TX, ptp_ch->state))
347 		goto out;
348 
349 	generic_ptpsq = &ptp_ch->ptpsq[0];
350 	mlx5e_health_fmsg_named_obj_nest_start(fmsg, "PTP");
351 	mlx5e_tx_reporter_diagnose_generic_txqsq(fmsg, &generic_ptpsq->txqsq);
352 	mlx5e_tx_reporter_diagnose_generic_tx_port_ts(fmsg, generic_ptpsq);
353 	mlx5e_health_fmsg_named_obj_nest_end(fmsg);
354 out:
355 	mlx5e_health_fmsg_named_obj_nest_end(fmsg);
356 }
357 
358 static void
mlx5e_tx_reporter_diagnose_tis_config(struct devlink_health_reporter * reporter,struct devlink_fmsg * fmsg)359 mlx5e_tx_reporter_diagnose_tis_config(struct devlink_health_reporter *reporter,
360 				      struct devlink_fmsg *fmsg)
361 {
362 	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
363 	u8 num_tc = mlx5e_get_dcb_num_tc(&priv->channels.params);
364 	u32 tc, i, tisn;
365 
366 	devlink_fmsg_arr_pair_nest_start(fmsg, "TIS Config");
367 	for (i = 0; i < mlx5e_get_num_lag_ports(priv->mdev); i++) {
368 		for (tc = 0; tc < num_tc; tc++) {
369 			tisn = mlx5e_profile_get_tisn(priv->mdev, priv,
370 						      priv->profile, i, tc);
371 
372 			devlink_fmsg_obj_nest_start(fmsg);
373 			devlink_fmsg_u32_pair_put(fmsg, "lag port", i);
374 			devlink_fmsg_u32_pair_put(fmsg, "tc", tc);
375 			devlink_fmsg_u32_pair_put(fmsg, "tisn", tisn);
376 			devlink_fmsg_obj_nest_end(fmsg);
377 		}
378 	}
379 	devlink_fmsg_arr_pair_nest_end(fmsg);
380 }
381 
mlx5e_tx_reporter_diagnose(struct devlink_health_reporter * reporter,struct devlink_fmsg * fmsg,struct netlink_ext_ack * extack)382 static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter,
383 				      struct devlink_fmsg *fmsg,
384 				      struct netlink_ext_ack *extack)
385 {
386 	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
387 	struct mlx5e_ptp *ptp_ch = priv->channels.ptp;
388 
389 	int i, tc;
390 
391 	mutex_lock(&priv->state_lock);
392 
393 	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
394 		goto unlock;
395 
396 	mlx5e_tx_reporter_diagnose_common_config(reporter, fmsg);
397 	mlx5e_tx_reporter_diagnose_tis_config(reporter, fmsg);
398 	devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
399 
400 	for (i = 0; i < priv->channels.num; i++) {
401 		struct mlx5e_channel *c = priv->channels.c[i];
402 
403 		for (tc = 0; tc < mlx5e_get_dcb_num_tc(&priv->channels.params); tc++) {
404 			struct mlx5e_txqsq *sq = &c->sq[tc];
405 
406 			mlx5e_tx_reporter_build_diagnose_output(fmsg, sq, tc);
407 		}
408 	}
409 
410 	if (!ptp_ch || !test_bit(MLX5E_PTP_STATE_TX, ptp_ch->state))
411 		goto close_sqs_nest;
412 
413 	for (tc = 0; tc < mlx5e_get_dcb_num_tc(&priv->channels.params); tc++)
414 		mlx5e_tx_reporter_build_diagnose_output_ptpsq(fmsg,
415 							      &ptp_ch->ptpsq[tc],
416 							      tc);
417 
418 close_sqs_nest:
419 	devlink_fmsg_arr_pair_nest_end(fmsg);
420 unlock:
421 	mutex_unlock(&priv->state_lock);
422 	return 0;
423 }
424 
/* Dump FW resources related to one SQ: the SX slice, the SQ's QPC and
 * its send buffer. Note that @key is deliberately reused across
 * sections: fields set for an earlier dump (size, index1, num_of_obj1)
 * remain in effect for the later ones.
 */
static int mlx5e_tx_reporter_dump_sq(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg,
				     void *ctx)
{
	struct mlx5_rsc_key key = {};
	struct mlx5e_txqsq *sq = ctx;

	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
		return 0;

	mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice");
	key.size = PAGE_SIZE;
	key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL;
	mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
	mlx5e_health_fmsg_named_obj_nest_end(fmsg);

	mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ");
	mlx5e_health_fmsg_named_obj_nest_start(fmsg, "QPC");
	key.rsc = MLX5_SGMT_TYPE_FULL_QPC;
	key.index1 = sq->sqn;
	key.num_of_obj1 = 1;
	mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
	mlx5e_health_fmsg_named_obj_nest_end(fmsg);

	mlx5e_health_fmsg_named_obj_nest_start(fmsg, "send_buff");
	key.rsc = MLX5_SGMT_TYPE_SND_BUFF;
	key.num_of_obj2 = MLX5_RSC_DUMP_ALL;
	mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
	mlx5e_health_fmsg_named_obj_nest_end(fmsg);

	mlx5e_health_fmsg_named_obj_nest_end(fmsg);

	return 0;
}
458 
mlx5e_tx_reporter_timeout_dump(struct mlx5e_priv * priv,struct devlink_fmsg * fmsg,void * ctx)459 static int mlx5e_tx_reporter_timeout_dump(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg,
460 					  void *ctx)
461 {
462 	struct mlx5e_tx_timeout_ctx *to_ctx = ctx;
463 
464 	return mlx5e_tx_reporter_dump_sq(priv, fmsg, to_ctx->sq);
465 }
466 
mlx5e_tx_reporter_ptpsq_unhealthy_dump(struct mlx5e_priv * priv,struct devlink_fmsg * fmsg,void * ctx)467 static int mlx5e_tx_reporter_ptpsq_unhealthy_dump(struct mlx5e_priv *priv,
468 						  struct devlink_fmsg *fmsg,
469 						  void *ctx)
470 {
471 	struct mlx5e_ptpsq *ptpsq = ctx;
472 
473 	return mlx5e_tx_reporter_dump_sq(priv, fmsg, &ptpsq->txqsq);
474 }
475 
/* Dump the SX slice once, then every regular SQ and (when PTP TX is
 * active) every PTP SQ. Used when the dump is requested without an
 * error context. No-op if the netdev is not opened.
 */
static int mlx5e_tx_reporter_dump_all_sqs(struct mlx5e_priv *priv,
					  struct devlink_fmsg *fmsg)
{
	struct mlx5e_ptp *ptp_ch = priv->channels.ptp;
	struct mlx5_rsc_key key = {};
	int i, tc;

	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
		return 0;

	mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice");
	key.size = PAGE_SIZE;
	key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL;
	mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
	mlx5e_health_fmsg_named_obj_nest_end(fmsg);
	devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");

	for (i = 0; i < priv->channels.num; i++) {
		struct mlx5e_channel *c = priv->channels.c[i];

		for (tc = 0; tc < mlx5e_get_dcb_num_tc(&priv->channels.params); tc++) {
			struct mlx5e_txqsq *sq = &c->sq[tc];

			mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "SQ");
		}
	}

	/* PTP SQs exist only while the PTP channel has TX enabled. */
	if (ptp_ch && test_bit(MLX5E_PTP_STATE_TX, ptp_ch->state)) {
		for (tc = 0; tc < mlx5e_get_dcb_num_tc(&priv->channels.params); tc++) {
			struct mlx5e_txqsq *sq = &ptp_ch->ptpsq[tc].txqsq;

			mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "PTP SQ");
		}
	}

	devlink_fmsg_arr_pair_nest_end(fmsg);
	return 0;
}
514 
/* Dispatch to the per-error dump callback installed by the caller. */
static int mlx5e_tx_reporter_dump_from_ctx(struct mlx5e_priv *priv,
					   struct mlx5e_err_ctx *err_ctx,
					   struct devlink_fmsg *fmsg)
{
	return err_ctx->dump(priv, fmsg, err_ctx->ctx);
}
521 
/* devlink health .dump callback: with a context, run that error's
 * specific dump; without one, dump all SQs.
 */
static int mlx5e_tx_reporter_dump(struct devlink_health_reporter *reporter,
				  struct devlink_fmsg *fmsg, void *context,
				  struct netlink_ext_ack *extack)
{
	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
	struct mlx5e_err_ctx *err_ctx = context;

	if (err_ctx)
		return mlx5e_tx_reporter_dump_from_ctx(priv, err_ctx, fmsg);

	return mlx5e_tx_reporter_dump_all_sqs(priv, fmsg);
}
532 
mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq * sq)533 void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq)
534 {
535 	char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
536 	struct mlx5e_priv *priv = sq->priv;
537 	struct mlx5e_err_ctx err_ctx = {};
538 
539 	err_ctx.ctx = sq;
540 	err_ctx.recover = mlx5e_tx_reporter_err_cqe_recover;
541 	err_ctx.dump = mlx5e_tx_reporter_dump_sq;
542 	snprintf(err_str, sizeof(err_str), "ERR CQE on SQ: 0x%x", sq->sqn);
543 
544 	mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx);
545 }
546 
mlx5e_reporter_tx_timeout(struct mlx5e_txqsq * sq)547 int mlx5e_reporter_tx_timeout(struct mlx5e_txqsq *sq)
548 {
549 	char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
550 	struct mlx5e_tx_timeout_ctx to_ctx = {};
551 	struct mlx5e_priv *priv = sq->priv;
552 	struct mlx5e_err_ctx err_ctx = {};
553 
554 	to_ctx.sq = sq;
555 	err_ctx.ctx = &to_ctx;
556 	err_ctx.recover = mlx5e_tx_reporter_timeout_recover;
557 	err_ctx.dump = mlx5e_tx_reporter_timeout_dump;
558 	snprintf(err_str, sizeof(err_str),
559 		 "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u",
560 		 sq->ch_ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc,
561 		 jiffies_to_usecs(jiffies - READ_ONCE(sq->txq->trans_start)));
562 
563 	mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx);
564 	return to_ctx.status;
565 }
566 
mlx5e_reporter_tx_ptpsq_unhealthy(struct mlx5e_ptpsq * ptpsq)567 void mlx5e_reporter_tx_ptpsq_unhealthy(struct mlx5e_ptpsq *ptpsq)
568 {
569 	struct mlx5e_ptp_metadata_map *map = &ptpsq->metadata_map;
570 	char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
571 	struct mlx5e_txqsq *txqsq = &ptpsq->txqsq;
572 	struct mlx5e_cq *ts_cq = &ptpsq->ts_cq;
573 	struct mlx5e_priv *priv = txqsq->priv;
574 	struct mlx5e_err_ctx err_ctx = {};
575 
576 	err_ctx.ctx = ptpsq;
577 	err_ctx.recover = mlx5e_tx_reporter_ptpsq_unhealthy_recover;
578 	err_ctx.dump = mlx5e_tx_reporter_ptpsq_unhealthy_dump;
579 	snprintf(err_str, sizeof(err_str),
580 		 "Unhealthy TX port TS queue: %d, SQ: 0x%x, CQ: 0x%x, Undelivered CQEs: %u Map Capacity: %u",
581 		 txqsq->ch_ix, txqsq->sqn, ts_cq->mcq.cqn, map->undelivered_counter, map->capacity);
582 
583 	mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx);
584 }
585 
/* Default pacing periods (in msec) handed to the devlink health core to
 * throttle automatic recoveries.
 */
#define MLX5E_REPORTER_TX_GRACEFUL_PERIOD 500
#define MLX5E_REPORTER_TX_BURST_PERIOD 500

/* devlink health reporter callbacks for the "tx" reporter. */
static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
		.name = "tx",
		.recover = mlx5e_tx_reporter_recover,
		.diagnose = mlx5e_tx_reporter_diagnose,
		.dump = mlx5e_tx_reporter_dump,
		.default_graceful_period = MLX5E_REPORTER_TX_GRACEFUL_PERIOD,
		.default_burst_period = MLX5E_REPORTER_TX_BURST_PERIOD,
};
597 
mlx5e_reporter_tx_create(struct mlx5e_priv * priv)598 void mlx5e_reporter_tx_create(struct mlx5e_priv *priv)
599 {
600 	struct devlink_port *port = priv->netdev->devlink_port;
601 	struct devlink_health_reporter *reporter;
602 
603 	reporter = devlink_port_health_reporter_create(port,
604 						       &mlx5_tx_reporter_ops,
605 						       priv);
606 	if (IS_ERR(reporter)) {
607 		netdev_warn(priv->netdev,
608 			    "Failed to create tx reporter, err = %pe\n",
609 			    reporter);
610 		return;
611 	}
612 	priv->tx_reporter = reporter;
613 }
614 
mlx5e_reporter_tx_destroy(struct mlx5e_priv * priv)615 void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv)
616 {
617 	if (!priv->tx_reporter)
618 		return;
619 
620 	devlink_health_reporter_destroy(priv->tx_reporter);
621 	priv->tx_reporter = NULL;
622 }
623