1 /* SPDX-License-Identifier: GPL-2.0 */
2 /* Copyright (c) 2019 Mellanox Technologies. */
3
4 #include <net/netdev_lock.h>
5
6 #include "health.h"
7 #include "en/ptp.h"
8 #include "en/devlink.h"
9 #include "lib/tout.h"
10
/* Human-readable names of the SQ software state bits, indexed by the
 * MLX5E_SQ_STATE_* enum values.
 *
 * Keep this string array consistent with the MLX5E_SQ_STATE_* enums in en.h:
 * mlx5e_health_sq_put_sw_state() checks at build time that the array size
 * equals MLX5E_NUM_SQ_STATES and emits one pair per entry.
 */
static const char * const sq_sw_state_type_name[] = {
	[MLX5E_SQ_STATE_ENABLED] = "enabled",
	[MLX5E_SQ_STATE_MPWQE] = "mpwqe",
	[MLX5E_SQ_STATE_RECOVERING] = "recovering",
	[MLX5E_SQ_STATE_IPSEC] = "ipsec",
	[MLX5E_SQ_STATE_DIM] = "dim",
	[MLX5E_SQ_STATE_PENDING_XSK_TX] = "pending_xsk_tx",
	[MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC] = "pending_tls_rx_resync",
	[MLX5E_SQ_STATE_LOCK_NEEDED] = "lock_needed",
};
22
mlx5e_wait_for_sq_flush(struct mlx5e_txqsq * sq)23 static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq)
24 {
25 struct mlx5_core_dev *dev = sq->mdev;
26 unsigned long exp_time;
27
28 exp_time = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, FLUSH_ON_ERROR));
29
30 while (time_before(jiffies, exp_time)) {
31 if (sq->cc == sq->pc)
32 return 0;
33
34 msleep(20);
35 }
36
37 netdev_err(sq->netdev,
38 "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n",
39 sq->sqn, sq->cc, sq->pc);
40
41 return -ETIMEDOUT;
42 }
43
/* Reset the SQ consumer (cc) and producer (pc) counters to zero.
 *
 * The two counters are expected to be equal when this is called; if they are
 * not, a one-time warning is emitted and the reset is performed anyway.
 */
static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq)
{
	WARN_ONCE(sq->cc != sq->pc,
		  "SQ 0x%x: cc (0x%x) != pc (0x%x)\n",
		  sq->sqn, sq->cc, sq->pc);
	sq->cc = 0;
	sq->pc = 0;
}
52
mlx5e_health_sq_put_sw_state(struct devlink_fmsg * fmsg,struct mlx5e_txqsq * sq)53 static void mlx5e_health_sq_put_sw_state(struct devlink_fmsg *fmsg, struct mlx5e_txqsq *sq)
54 {
55 int i;
56
57 BUILD_BUG_ON_MSG(ARRAY_SIZE(sq_sw_state_type_name) != MLX5E_NUM_SQ_STATES,
58 "sq_sw_state_type_name string array must be consistent with MLX5E_SQ_STATE_* enum in en.h");
59 mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SW State");
60
61 for (i = 0; i < ARRAY_SIZE(sq_sw_state_type_name); ++i)
62 devlink_fmsg_u32_pair_put(fmsg, sq_sw_state_type_name[i],
63 test_bit(i, &sq->state));
64
65 mlx5e_health_fmsg_named_obj_nest_end(fmsg);
66 }
67
mlx5e_tx_reporter_err_cqe_recover(void * ctx)68 static int mlx5e_tx_reporter_err_cqe_recover(void *ctx)
69 {
70 struct mlx5_core_dev *mdev;
71 struct net_device *dev;
72 struct mlx5e_txqsq *sq;
73 u8 state;
74 int err;
75
76 sq = ctx;
77 mdev = sq->mdev;
78 dev = sq->netdev;
79
80 if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state))
81 return 0;
82
83 /* Recovering queues means re-enabling NAPI, which requires the netdev
84 * instance lock. However, SQ closing flows have to wait for work tasks
85 * to finish while also holding the netdev instance lock. So either get
86 * the lock or find that the SQ is no longer enabled and thus this work
87 * is not relevant anymore.
88 */
89 while (!netdev_trylock(dev)) {
90 if (!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state))
91 return 0;
92 msleep(20);
93 }
94
95 err = mlx5_core_query_sq_state(mdev, sq->sqn, &state);
96 if (err) {
97 netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n",
98 sq->sqn, err);
99 goto out;
100 }
101
102 if (state != MLX5_SQC_STATE_ERR)
103 goto out;
104
105 mlx5e_tx_disable_queue(sq->txq);
106
107 err = mlx5e_wait_for_sq_flush(sq);
108 if (err)
109 goto out;
110
111 /* At this point, no new packets will arrive from the stack as TXQ is
112 * marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all
113 * pending WQEs. SQ can safely reset the SQ.
114 */
115
116 err = mlx5e_health_sq_to_ready(mdev, dev, sq->sqn);
117 if (err)
118 goto out;
119
120 mlx5e_reset_txqsq_cc_pc(sq);
121 sq->stats->recover++;
122 clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
123 mlx5e_activate_txqsq(sq);
124
125 if (sq->channel)
126 mlx5e_trigger_napi_icosq(sq->channel);
127 else
128 mlx5e_trigger_napi_sched(sq->cq.napi);
129
130 netdev_unlock(dev);
131 return 0;
132 out:
133 clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
134 netdev_unlock(dev);
135 return err;
136 }
137
/* Context passed to the TX timeout recover and dump callbacks. */
struct mlx5e_tx_timeout_ctx {
	/* The SQ whose TX queue timed out. */
	struct mlx5e_txqsq *sq;
	/* Outcome set by mlx5e_tx_reporter_timeout_recover(): 0 when this SQ
	 * recovered, 1 when all channels were recovered, otherwise the error
	 * returned by mlx5e_safe_reopen_channels().
	 */
	signed int status;
};
142
mlx5e_tx_reporter_timeout_recover(void * ctx)143 static int mlx5e_tx_reporter_timeout_recover(void *ctx)
144 {
145 struct mlx5e_tx_timeout_ctx *to_ctx;
146 struct mlx5e_priv *priv;
147 struct mlx5_eq_comp *eq;
148 struct mlx5e_txqsq *sq;
149 int err;
150
151 to_ctx = ctx;
152 sq = to_ctx->sq;
153 eq = sq->cq.mcq.eq;
154 priv = sq->priv;
155
156 /* Recovering the TX queues implies re-enabling NAPI, which requires
157 * the netdev instance lock.
158 * However, channel closing flows have to wait for this work to finish
159 * while holding the same lock. So either get the lock or find that
160 * channels are being closed for other reason and this work is not
161 * relevant anymore.
162 */
163 while (!netdev_trylock(sq->netdev)) {
164 if (!test_bit(MLX5E_STATE_CHANNELS_ACTIVE, &priv->state))
165 return 0;
166 msleep(20);
167 }
168
169 err = mlx5e_health_channel_eq_recover(sq->netdev, eq, sq->cq.ch_stats);
170 if (!err) {
171 to_ctx->status = 0; /* this sq recovered */
172 goto out;
173 }
174
175 mutex_lock(&priv->state_lock);
176 err = mlx5e_safe_reopen_channels(priv);
177 mutex_unlock(&priv->state_lock);
178 if (!err) {
179 to_ctx->status = 1; /* all channels recovered */
180 goto out;
181 }
182
183 to_ctx->status = err;
184 clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
185 netdev_err(priv->netdev,
186 "mlx5e_safe_reopen_channels failed recovering from a tx_timeout, err(%d).\n",
187 err);
188 out:
189 netdev_unlock(sq->netdev);
190 return err;
191 }
192
mlx5e_tx_reporter_ptpsq_unhealthy_recover(void * ctx)193 static int mlx5e_tx_reporter_ptpsq_unhealthy_recover(void *ctx)
194 {
195 struct mlx5e_ptpsq *ptpsq = ctx;
196 struct mlx5e_channels *chs;
197 struct net_device *netdev;
198 struct mlx5e_priv *priv;
199 int carrier_ok;
200 int err;
201
202 if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &ptpsq->txqsq.state))
203 return 0;
204
205 priv = ptpsq->txqsq.priv;
206 netdev = priv->netdev;
207
208 /* Recovering the PTP SQ means re-enabling NAPI, which requires the
209 * netdev instance lock. However, SQ closing has to wait for this work
210 * task to finish while also holding the same lock. So either get the
211 * lock or find that the SQ is no longer enabled and thus this work is
212 * not relevant anymore.
213 */
214 while (!netdev_trylock(netdev)) {
215 if (!test_bit(MLX5E_SQ_STATE_ENABLED, &ptpsq->txqsq.state))
216 return 0;
217 msleep(20);
218 }
219
220 mutex_lock(&priv->state_lock);
221 chs = &priv->channels;
222
223 carrier_ok = netif_carrier_ok(netdev);
224 netif_carrier_off(netdev);
225
226 mlx5e_deactivate_priv_channels(priv);
227
228 mlx5e_ptp_close(chs->ptp);
229 err = mlx5e_ptp_open(priv, &chs->params, chs->c[0]->lag_port, &chs->ptp);
230
231 mlx5e_activate_priv_channels(priv);
232
233 /* return carrier back if needed */
234 if (carrier_ok)
235 netif_carrier_on(netdev);
236
237 mutex_unlock(&priv->state_lock);
238 netdev_unlock(netdev);
239
240 return err;
241 }
242
/* Run the recover callback carried by @err_ctx on its context and return the
 * callback's result.
 *
 * The state lock cannot be grabbed within this function.
 * It can cause a deadlock or a read-after-free.
 */
static int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx)
{
	return err_ctx->recover(err_ctx->ctx);
}
250
/* devlink health .recover callback of the TX reporter.
 *
 * When @context carries an error context, its dedicated recover callback is
 * run; without one, all channels are recovered.
 *
 * Return: the result of whichever recovery was run.
 */
static int mlx5e_tx_reporter_recover(struct devlink_health_reporter *reporter,
				     void *context,
				     struct netlink_ext_ack *extack)
{
	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
	struct mlx5e_err_ctx *err_ctx = context;

	if (!err_ctx)
		return mlx5e_health_recover_channels(priv);

	return mlx5e_tx_reporter_recover_from_ctx(err_ctx);
}
261
262 static void
mlx5e_tx_reporter_build_diagnose_output_sq_common(struct devlink_fmsg * fmsg,struct mlx5e_txqsq * sq,int tc)263 mlx5e_tx_reporter_build_diagnose_output_sq_common(struct devlink_fmsg *fmsg,
264 struct mlx5e_txqsq *sq, int tc)
265 {
266 bool stopped = netif_xmit_stopped(sq->txq);
267 u8 state;
268 int err;
269
270 devlink_fmsg_u32_pair_put(fmsg, "tc", tc);
271 devlink_fmsg_u32_pair_put(fmsg, "txq ix", sq->txq_ix);
272 devlink_fmsg_u32_pair_put(fmsg, "sqn", sq->sqn);
273
274 err = mlx5_core_query_sq_state(sq->mdev, sq->sqn, &state);
275 if (!err)
276 devlink_fmsg_u8_pair_put(fmsg, "HW state", state);
277
278 devlink_fmsg_bool_pair_put(fmsg, "stopped", stopped);
279 devlink_fmsg_u32_pair_put(fmsg, "cc", sq->cc);
280 devlink_fmsg_u32_pair_put(fmsg, "pc", sq->pc);
281 mlx5e_health_sq_put_sw_state(fmsg, sq);
282 mlx5e_health_cq_diag_fmsg(&sq->cq, fmsg);
283 mlx5e_health_eq_diag_fmsg(sq->cq.mcq.eq, fmsg);
284 }
285
/* Emit one object describing a regular channel SQ into @fmsg: its channel
 * index followed by the fields common to all SQs.
 */
static void
mlx5e_tx_reporter_build_diagnose_output(struct devlink_fmsg *fmsg,
					struct mlx5e_txqsq *sq, int tc)
{
	devlink_fmsg_obj_nest_start(fmsg);
	devlink_fmsg_u32_pair_put(fmsg, "channel ix", sq->ch_ix);
	mlx5e_tx_reporter_build_diagnose_output_sq_common(fmsg, sq, tc);
	devlink_fmsg_obj_nest_end(fmsg);
}
295
/* Emit one object describing a PTP SQ into @fmsg: a "channel": "ptp" pair,
 * the fields common to all SQs, and a nested "Port TS" object with the
 * diagnostics of the PTP SQ's timestamp CQ (ts_cq).
 */
static void
mlx5e_tx_reporter_build_diagnose_output_ptpsq(struct devlink_fmsg *fmsg,
					      struct mlx5e_ptpsq *ptpsq, int tc)
{
	devlink_fmsg_obj_nest_start(fmsg);
	devlink_fmsg_string_pair_put(fmsg, "channel", "ptp");
	mlx5e_tx_reporter_build_diagnose_output_sq_common(fmsg, &ptpsq->txqsq, tc);
	mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Port TS");
	mlx5e_health_cq_diag_fmsg(&ptpsq->ts_cq, fmsg);
	mlx5e_health_fmsg_named_obj_nest_end(fmsg);
	devlink_fmsg_obj_nest_end(fmsg);
}
308
309 static void
mlx5e_tx_reporter_diagnose_generic_txqsq(struct devlink_fmsg * fmsg,struct mlx5e_txqsq * txqsq)310 mlx5e_tx_reporter_diagnose_generic_txqsq(struct devlink_fmsg *fmsg,
311 struct mlx5e_txqsq *txqsq)
312 {
313 bool real_time = mlx5_is_real_time_sq(txqsq->mdev);
314 u32 sq_sz = mlx5_wq_cyc_get_size(&txqsq->wq);
315 u32 sq_stride = MLX5_SEND_WQE_BB;
316
317 mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ");
318 devlink_fmsg_u64_pair_put(fmsg, "stride size", sq_stride);
319 devlink_fmsg_u32_pair_put(fmsg, "size", sq_sz);
320 devlink_fmsg_string_pair_put(fmsg, "ts_format", real_time ? "RT" : "FRC");
321 mlx5e_health_cq_common_diag_fmsg(&txqsq->cq, fmsg);
322 mlx5e_health_fmsg_named_obj_nest_end(fmsg);
323 }
324
/* Emit a nested "Port TS" object into @fmsg holding the common CQ
 * configuration of the PTP SQ's timestamp CQ (ts_cq).
 */
static void
mlx5e_tx_reporter_diagnose_generic_tx_port_ts(struct devlink_fmsg *fmsg,
					      struct mlx5e_ptpsq *ptpsq)
{
	mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Port TS");
	mlx5e_health_cq_common_diag_fmsg(&ptpsq->ts_cq, fmsg);
	mlx5e_health_fmsg_named_obj_nest_end(fmsg);
}
333
334 static void
mlx5e_tx_reporter_diagnose_common_config(struct devlink_health_reporter * reporter,struct devlink_fmsg * fmsg)335 mlx5e_tx_reporter_diagnose_common_config(struct devlink_health_reporter *reporter,
336 struct devlink_fmsg *fmsg)
337 {
338 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
339 struct mlx5e_txqsq *generic_sq = priv->txq2sq[0];
340 struct mlx5e_ptp *ptp_ch = priv->channels.ptp;
341 struct mlx5e_ptpsq *generic_ptpsq;
342
343 mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Common Config");
344 mlx5e_tx_reporter_diagnose_generic_txqsq(fmsg, generic_sq);
345
346 if (!ptp_ch || !test_bit(MLX5E_PTP_STATE_TX, ptp_ch->state))
347 goto out;
348
349 generic_ptpsq = &ptp_ch->ptpsq[0];
350 mlx5e_health_fmsg_named_obj_nest_start(fmsg, "PTP");
351 mlx5e_tx_reporter_diagnose_generic_txqsq(fmsg, &generic_ptpsq->txqsq);
352 mlx5e_tx_reporter_diagnose_generic_tx_port_ts(fmsg, generic_ptpsq);
353 mlx5e_health_fmsg_named_obj_nest_end(fmsg);
354 out:
355 mlx5e_health_fmsg_named_obj_nest_end(fmsg);
356 }
357
358 static void
mlx5e_tx_reporter_diagnose_tis_config(struct devlink_health_reporter * reporter,struct devlink_fmsg * fmsg)359 mlx5e_tx_reporter_diagnose_tis_config(struct devlink_health_reporter *reporter,
360 struct devlink_fmsg *fmsg)
361 {
362 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
363 u8 num_tc = mlx5e_get_dcb_num_tc(&priv->channels.params);
364 u32 tc, i, tisn;
365
366 devlink_fmsg_arr_pair_nest_start(fmsg, "TIS Config");
367 for (i = 0; i < mlx5e_get_num_lag_ports(priv->mdev); i++) {
368 for (tc = 0; tc < num_tc; tc++) {
369 tisn = mlx5e_profile_get_tisn(priv->mdev, priv,
370 priv->profile, i, tc);
371
372 devlink_fmsg_obj_nest_start(fmsg);
373 devlink_fmsg_u32_pair_put(fmsg, "lag port", i);
374 devlink_fmsg_u32_pair_put(fmsg, "tc", tc);
375 devlink_fmsg_u32_pair_put(fmsg, "tisn", tisn);
376 devlink_fmsg_obj_nest_end(fmsg);
377 }
378 }
379 devlink_fmsg_arr_pair_nest_end(fmsg);
380 }
381
mlx5e_tx_reporter_diagnose(struct devlink_health_reporter * reporter,struct devlink_fmsg * fmsg,struct netlink_ext_ack * extack)382 static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter,
383 struct devlink_fmsg *fmsg,
384 struct netlink_ext_ack *extack)
385 {
386 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
387 struct mlx5e_ptp *ptp_ch = priv->channels.ptp;
388
389 int i, tc;
390
391 mutex_lock(&priv->state_lock);
392
393 if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
394 goto unlock;
395
396 mlx5e_tx_reporter_diagnose_common_config(reporter, fmsg);
397 mlx5e_tx_reporter_diagnose_tis_config(reporter, fmsg);
398 devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
399
400 for (i = 0; i < priv->channels.num; i++) {
401 struct mlx5e_channel *c = priv->channels.c[i];
402
403 for (tc = 0; tc < mlx5e_get_dcb_num_tc(&priv->channels.params); tc++) {
404 struct mlx5e_txqsq *sq = &c->sq[tc];
405
406 mlx5e_tx_reporter_build_diagnose_output(fmsg, sq, tc);
407 }
408 }
409
410 if (!ptp_ch || !test_bit(MLX5E_PTP_STATE_TX, ptp_ch->state))
411 goto close_sqs_nest;
412
413 for (tc = 0; tc < mlx5e_get_dcb_num_tc(&priv->channels.params); tc++)
414 mlx5e_tx_reporter_build_diagnose_output_ptpsq(fmsg,
415 &ptp_ch->ptpsq[tc],
416 tc);
417
418 close_sqs_nest:
419 devlink_fmsg_arr_pair_nest_end(fmsg);
420 unlock:
421 mutex_unlock(&priv->state_lock);
422 return 0;
423 }
424
/* Dump callback for a single SQ; @ctx is the struct mlx5e_txqsq to dump.
 *
 * Emits into @fmsg an "SX Slice" object followed by an "SQ" object that nests
 * a "QPC" and a "send_buff" object, each filled by a resource dump. Nothing is
 * emitted if the netdev is not in the OPENED state.
 *
 * Return: always 0.
 */
static int mlx5e_tx_reporter_dump_sq(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg,
				     void *ctx)
{
	struct mlx5_rsc_key key = {};
	struct mlx5e_txqsq *sq = ctx;

	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
		return 0;

	/* The same key is reused for all three dumps below; fields set for an
	 * earlier dump stay in effect for the later ones.
	 */
	mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice");
	key.size = PAGE_SIZE;
	key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL;
	mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
	mlx5e_health_fmsg_named_obj_nest_end(fmsg);

	mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ");
	mlx5e_health_fmsg_named_obj_nest_start(fmsg, "QPC");
	key.rsc = MLX5_SGMT_TYPE_FULL_QPC;
	key.index1 = sq->sqn;
	key.num_of_obj1 = 1;
	mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
	mlx5e_health_fmsg_named_obj_nest_end(fmsg);

	/* index1 and num_of_obj1 are carried over from the QPC dump. */
	mlx5e_health_fmsg_named_obj_nest_start(fmsg, "send_buff");
	key.rsc = MLX5_SGMT_TYPE_SND_BUFF;
	key.num_of_obj2 = MLX5_RSC_DUMP_ALL;
	mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
	mlx5e_health_fmsg_named_obj_nest_end(fmsg);

	/* Closes the "SQ" object. */
	mlx5e_health_fmsg_named_obj_nest_end(fmsg);

	return 0;
}
458
/* Dump callback for a TX timeout: @ctx is a struct mlx5e_tx_timeout_ctx, and
 * the SQ it refers to is dumped with mlx5e_tx_reporter_dump_sq().
 */
static int mlx5e_tx_reporter_timeout_dump(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg,
					  void *ctx)
{
	struct mlx5e_tx_timeout_ctx *to_ctx = ctx;

	return mlx5e_tx_reporter_dump_sq(priv, fmsg, to_ctx->sq);
}
466
/* Dump callback for an unhealthy PTP SQ: @ctx is a struct mlx5e_ptpsq, and its
 * embedded txqsq is dumped with mlx5e_tx_reporter_dump_sq().
 */
static int mlx5e_tx_reporter_ptpsq_unhealthy_dump(struct mlx5e_priv *priv,
						  struct devlink_fmsg *fmsg,
						  void *ctx)
{
	struct mlx5e_ptpsq *ptpsq = ctx;

	return mlx5e_tx_reporter_dump_sq(priv, fmsg, &ptpsq->txqsq);
}
475
mlx5e_tx_reporter_dump_all_sqs(struct mlx5e_priv * priv,struct devlink_fmsg * fmsg)476 static int mlx5e_tx_reporter_dump_all_sqs(struct mlx5e_priv *priv,
477 struct devlink_fmsg *fmsg)
478 {
479 struct mlx5e_ptp *ptp_ch = priv->channels.ptp;
480 struct mlx5_rsc_key key = {};
481 int i, tc;
482
483 if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
484 return 0;
485
486 mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice");
487 key.size = PAGE_SIZE;
488 key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL;
489 mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
490 mlx5e_health_fmsg_named_obj_nest_end(fmsg);
491 devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
492
493 for (i = 0; i < priv->channels.num; i++) {
494 struct mlx5e_channel *c = priv->channels.c[i];
495
496 for (tc = 0; tc < mlx5e_get_dcb_num_tc(&priv->channels.params); tc++) {
497 struct mlx5e_txqsq *sq = &c->sq[tc];
498
499 mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "SQ");
500 }
501 }
502
503 if (ptp_ch && test_bit(MLX5E_PTP_STATE_TX, ptp_ch->state)) {
504 for (tc = 0; tc < mlx5e_get_dcb_num_tc(&priv->channels.params); tc++) {
505 struct mlx5e_txqsq *sq = &ptp_ch->ptpsq[tc].txqsq;
506
507 mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "PTP SQ");
508 }
509 }
510
511 devlink_fmsg_arr_pair_nest_end(fmsg);
512 return 0;
513 }
514
/* Run the dump callback carried by @err_ctx on its context and return the
 * callback's result.
 */
static int mlx5e_tx_reporter_dump_from_ctx(struct mlx5e_priv *priv,
					   struct mlx5e_err_ctx *err_ctx,
					   struct devlink_fmsg *fmsg)
{
	return err_ctx->dump(priv, fmsg, err_ctx->ctx);
}
521
/* devlink health .dump callback of the TX reporter.
 *
 * When @context carries an error context, its dedicated dump callback is run;
 * without one, all SQs are dumped.
 *
 * Return: the result of whichever dump was run.
 */
static int mlx5e_tx_reporter_dump(struct devlink_health_reporter *reporter,
				  struct devlink_fmsg *fmsg, void *context,
				  struct netlink_ext_ack *extack)
{
	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
	struct mlx5e_err_ctx *err_ctx = context;

	if (!err_ctx)
		return mlx5e_tx_reporter_dump_all_sqs(priv, fmsg);

	return mlx5e_tx_reporter_dump_from_ctx(priv, err_ctx, fmsg);
}
532
mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq * sq)533 void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq)
534 {
535 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
536 struct mlx5e_priv *priv = sq->priv;
537 struct mlx5e_err_ctx err_ctx = {};
538
539 err_ctx.ctx = sq;
540 err_ctx.recover = mlx5e_tx_reporter_err_cqe_recover;
541 err_ctx.dump = mlx5e_tx_reporter_dump_sq;
542 snprintf(err_str, sizeof(err_str), "ERR CQE on SQ: 0x%x", sq->sqn);
543
544 mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx);
545 }
546
mlx5e_reporter_tx_timeout(struct mlx5e_txqsq * sq)547 int mlx5e_reporter_tx_timeout(struct mlx5e_txqsq *sq)
548 {
549 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
550 struct mlx5e_tx_timeout_ctx to_ctx = {};
551 struct mlx5e_priv *priv = sq->priv;
552 struct mlx5e_err_ctx err_ctx = {};
553
554 to_ctx.sq = sq;
555 err_ctx.ctx = &to_ctx;
556 err_ctx.recover = mlx5e_tx_reporter_timeout_recover;
557 err_ctx.dump = mlx5e_tx_reporter_timeout_dump;
558 snprintf(err_str, sizeof(err_str),
559 "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u",
560 sq->ch_ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc,
561 jiffies_to_usecs(jiffies - READ_ONCE(sq->txq->trans_start)));
562
563 mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx);
564 return to_ctx.status;
565 }
566
mlx5e_reporter_tx_ptpsq_unhealthy(struct mlx5e_ptpsq * ptpsq)567 void mlx5e_reporter_tx_ptpsq_unhealthy(struct mlx5e_ptpsq *ptpsq)
568 {
569 struct mlx5e_ptp_metadata_map *map = &ptpsq->metadata_map;
570 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
571 struct mlx5e_txqsq *txqsq = &ptpsq->txqsq;
572 struct mlx5e_cq *ts_cq = &ptpsq->ts_cq;
573 struct mlx5e_priv *priv = txqsq->priv;
574 struct mlx5e_err_ctx err_ctx = {};
575
576 err_ctx.ctx = ptpsq;
577 err_ctx.recover = mlx5e_tx_reporter_ptpsq_unhealthy_recover;
578 err_ctx.dump = mlx5e_tx_reporter_ptpsq_unhealthy_dump;
579 snprintf(err_str, sizeof(err_str),
580 "Unhealthy TX port TS queue: %d, SQ: 0x%x, CQ: 0x%x, Undelivered CQEs: %u Map Capacity: %u",
581 txqsq->ch_ix, txqsq->sqn, ts_cq->mcq.cqn, map->undelivered_counter, map->capacity);
582
583 mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx);
584 }
585
/* Default graceful and burst periods of the TX reporter.
 * NOTE(review): presumably in milliseconds - confirm against the devlink
 * health reporter API.
 */
#define MLX5E_REPORTER_TX_GRACEFUL_PERIOD 500
#define MLX5E_REPORTER_TX_BURST_PERIOD 500

/* devlink health reporter operations of the "tx" reporter. */
static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
		.name = "tx",
		.recover = mlx5e_tx_reporter_recover,
		.diagnose = mlx5e_tx_reporter_diagnose,
		.dump = mlx5e_tx_reporter_dump,
		.default_graceful_period = MLX5E_REPORTER_TX_GRACEFUL_PERIOD,
		.default_burst_period = MLX5E_REPORTER_TX_BURST_PERIOD,
};
597
mlx5e_reporter_tx_create(struct mlx5e_priv * priv)598 void mlx5e_reporter_tx_create(struct mlx5e_priv *priv)
599 {
600 struct devlink_port *port = priv->netdev->devlink_port;
601 struct devlink_health_reporter *reporter;
602
603 reporter = devlink_port_health_reporter_create(port,
604 &mlx5_tx_reporter_ops,
605 priv);
606 if (IS_ERR(reporter)) {
607 netdev_warn(priv->netdev,
608 "Failed to create tx reporter, err = %pe\n",
609 reporter);
610 return;
611 }
612 priv->tx_reporter = reporter;
613 }
614
mlx5e_reporter_tx_destroy(struct mlx5e_priv * priv)615 void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv)
616 {
617 if (!priv->tx_reporter)
618 return;
619
620 devlink_health_reporter_destroy(priv->tx_reporter);
621 priv->tx_reporter = NULL;
622 }
623