xref: /linux/drivers/infiniband/hw/mlx5/counters.c (revision 62597edf6340191511bdf9a7f64fa315ddc58805)
1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /*
3  * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved.
4  */
5 
6 #include "mlx5_ib.h"
7 #include <linux/mlx5/eswitch.h>
8 #include <linux/mlx5/vport.h>
9 #include "counters.h"
10 #include "ib_rep.h"
11 #include "qp.h"
12 
/*
 * One entry of the static counter tables in this file.
 *
 * @name is the string exposed for the counter, @offset is the byte offset of
 * the counter's field inside the output layout it is read from, and @type is
 * the MLX5_IB_OPCOUNTER_* value (only set by INIT_OP_COUNTER()).
 */
13 struct mlx5_ib_counter {
14 	const char *name;
15 	size_t offset;
16 	u32 type;
17 };
18 
/*
 * Table entry for a Q counter: the exposed name is the field name itself and
 * the offset is that field's byte offset in query_q_counter_out.
 */
19 #define INIT_Q_COUNTER(_name)		\
20 	{ .name = #_name, .offset = MLX5_BYTE_OFF(query_q_counter_out, _name)}
21 
/*
 * Same offset as INIT_Q_COUNTER(), but the exposed name carries a "vport_"
 * prefix.
 */
22 #define INIT_VPORT_Q_COUNTER(_name)		\
23 	{ .name = "vport_" #_name, .offset =	\
24 		MLX5_BYTE_OFF(query_q_counter_out, _name)}
25 
/*
 * Q counters that are always exposed. mlx5_ib_fill_counters() copies the
 * entries in table order, so the order here defines the counter indices.
 */
26 static const struct mlx5_ib_counter basic_q_cnts[] = {
27 	INIT_Q_COUNTER(rx_write_requests),
28 	INIT_Q_COUNTER(rx_read_requests),
29 	INIT_Q_COUNTER(rx_atomic_requests),
30 	INIT_Q_COUNTER(rx_dct_connect),
31 	INIT_Q_COUNTER(out_of_buffer),
32 };
33 
/* Added only when the out_of_seq_cnt general capability is set. */
34 static const struct mlx5_ib_counter out_of_seq_q_cnts[] = {
35 	INIT_Q_COUNTER(out_of_sequence),
36 };
37 
/* Added only when the retransmission_q_counters general capability is set. */
38 static const struct mlx5_ib_counter retrans_q_cnts[] = {
39 	INIT_Q_COUNTER(duplicate_request),
40 	INIT_Q_COUNTER(rnr_nak_retry_err),
41 	INIT_Q_COUNTER(packet_seq_err),
42 	INIT_Q_COUNTER(implied_nak_seq_err),
43 	INIT_Q_COUNTER(local_ack_timeout_err),
44 };
45 
/*
 * "vport_"-prefixed twin of basic_q_cnts; selected instead of it when the
 * device is in switchdev mode and the port index is not MLX5_VPORT_PF.
 */
46 static const struct mlx5_ib_counter vport_basic_q_cnts[] = {
47 	INIT_VPORT_Q_COUNTER(rx_write_requests),
48 	INIT_VPORT_Q_COUNTER(rx_read_requests),
49 	INIT_VPORT_Q_COUNTER(rx_atomic_requests),
50 	INIT_VPORT_Q_COUNTER(rx_dct_connect),
51 	INIT_VPORT_Q_COUNTER(out_of_buffer),
52 };
53 
/* "vport_"-prefixed twin of out_of_seq_q_cnts. */
54 static const struct mlx5_ib_counter vport_out_of_seq_q_cnts[] = {
55 	INIT_VPORT_Q_COUNTER(out_of_sequence),
56 };
57 
/* "vport_"-prefixed twin of retrans_q_cnts. */
58 static const struct mlx5_ib_counter vport_retrans_q_cnts[] = {
59 	INIT_VPORT_Q_COUNTER(duplicate_request),
60 	INIT_VPORT_Q_COUNTER(rnr_nak_retry_err),
61 	INIT_VPORT_Q_COUNTER(packet_seq_err),
62 	INIT_VPORT_Q_COUNTER(implied_nak_seq_err),
63 	INIT_VPORT_Q_COUNTER(local_ack_timeout_err),
64 };
65 
/*
 * Table entry for a congestion counter: the offset points at the
 * "<name>_high" field of query_cong_statistics_out.
 */
66 #define INIT_CONG_COUNTER(_name)		\
67 	{ .name = #_name, .offset =	\
68 		MLX5_BYTE_OFF(query_cong_statistics_out, _name ## _high)}
69 
/*
 * Congestion counters; added for non-vport ports only, when the
 * cc_query_allowed general capability is set.
 */
70 static const struct mlx5_ib_counter cong_cnts[] = {
71 	INIT_CONG_COUNTER(rp_cnp_ignored),
72 	INIT_CONG_COUNTER(rp_cnp_handled),
73 	INIT_CONG_COUNTER(np_ecn_marked_roce_packets),
74 	INIT_CONG_COUNTER(np_cnp_sent),
75 };
76 
/* Added only when the enhanced_error_q_counters general capability is set. */
77 static const struct mlx5_ib_counter extended_err_cnts[] = {
78 	INIT_Q_COUNTER(resp_local_length_error),
79 	INIT_Q_COUNTER(resp_cqe_error),
80 	INIT_Q_COUNTER(req_cqe_error),
81 	INIT_Q_COUNTER(req_remote_invalid_request),
82 	INIT_Q_COUNTER(req_remote_access_errors),
83 	INIT_Q_COUNTER(resp_remote_access_errors),
84 	INIT_Q_COUNTER(resp_cqe_flush_error),
85 	INIT_Q_COUNTER(req_cqe_flush_error),
86 	INIT_Q_COUNTER(req_transport_retries_exceeded),
87 	INIT_Q_COUNTER(req_rnr_retries_exceeded),
88 };
89 
/* Added only when the roce_accl general capability is set. */
90 static const struct mlx5_ib_counter roce_accl_cnts[] = {
91 	INIT_Q_COUNTER(roce_adp_retrans),
92 	INIT_Q_COUNTER(roce_adp_retrans_to),
93 	INIT_Q_COUNTER(roce_slow_restart),
94 	INIT_Q_COUNTER(roce_slow_restart_cnps),
95 	INIT_Q_COUNTER(roce_slow_restart_trans),
96 };
97 
/* "vport_"-prefixed twin of extended_err_cnts. */
98 static const struct mlx5_ib_counter vport_extended_err_cnts[] = {
99 	INIT_VPORT_Q_COUNTER(resp_local_length_error),
100 	INIT_VPORT_Q_COUNTER(resp_cqe_error),
101 	INIT_VPORT_Q_COUNTER(req_cqe_error),
102 	INIT_VPORT_Q_COUNTER(req_remote_invalid_request),
103 	INIT_VPORT_Q_COUNTER(req_remote_access_errors),
104 	INIT_VPORT_Q_COUNTER(resp_remote_access_errors),
105 	INIT_VPORT_Q_COUNTER(resp_cqe_flush_error),
106 	INIT_VPORT_Q_COUNTER(req_cqe_flush_error),
107 	INIT_VPORT_Q_COUNTER(req_transport_retries_exceeded),
108 	INIT_VPORT_Q_COUNTER(req_rnr_retries_exceeded),
109 };
110 
/* "vport_"-prefixed twin of roce_accl_cnts. */
111 static const struct mlx5_ib_counter vport_roce_accl_cnts[] = {
112 	INIT_VPORT_Q_COUNTER(roce_adp_retrans),
113 	INIT_VPORT_Q_COUNTER(roce_adp_retrans_to),
114 	INIT_VPORT_Q_COUNTER(roce_slow_restart),
115 	INIT_VPORT_Q_COUNTER(roce_slow_restart_cnps),
116 	INIT_VPORT_Q_COUNTER(roce_slow_restart_trans),
117 };
118 
/*
 * Table entry for an extended PPCNT counter: the offset points at the
 * "<name>_high" field of the eth_extended_cntrs_grp_data_layout counter set
 * inside ppcnt_reg.
 */
119 #define INIT_EXT_PPCNT_COUNTER(_name)		\
120 	{ .name = #_name, .offset =	\
121 	MLX5_BYTE_OFF(ppcnt_reg, \
122 		      counter_set.eth_extended_cntrs_grp_data_layout._name##_high)}
123 
/*
 * Added for non-vport ports only, when the rx_icrc_encapsulated_counter PCAM
 * feature is reported.
 */
124 static const struct mlx5_ib_counter ext_ppcnt_cnts[] = {
125 	INIT_EXT_PPCNT_COUNTER(rx_icrc_encapsulated),
126 };
127 
/*
 * Table entry for an optional (op) counter. Only the name and the
 * MLX5_IB_OPCOUNTER_* type are set; .offset is left at its default of zero.
 */
128 #define INIT_OP_COUNTER(_name, _type)		\
129 	{ .name = #_name, .type = MLX5_IB_OPCOUNTER_##_type}
130 
/* Op counters always listed for non-vport ports. */
131 static const struct mlx5_ib_counter basic_op_cnts[] = {
132 	INIT_OP_COUNTER(cc_rx_ce_pkts, CC_RX_CE_PKTS),
133 };
134 
/*
 * Listed only when the flow table capability
 * ft_field_support_2_nic_receive_rdma.bth_opcode is set.
 */
135 static const struct mlx5_ib_counter rdmarx_cnp_op_cnts[] = {
136 	INIT_OP_COUNTER(cc_rx_cnp_pkts, CC_RX_CNP_PKTS),
137 };
138 
/*
 * Listed only when the flow table capability
 * ft_field_support_2_nic_transmit_rdma.bth_opcode is set.
 */
139 static const struct mlx5_ib_counter rdmatx_cnp_op_cnts[] = {
140 	INIT_OP_COUNTER(cc_tx_cnp_pkts, CC_TX_CNP_PKTS),
141 };
142 
143 static int mlx5_ib_read_counters(struct ib_counters *counters,
144 				 struct ib_counters_read_attr *read_attr,
145 				 struct uverbs_attr_bundle *attrs)
146 {
147 	struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
148 	struct mlx5_read_counters_attr mread_attr = {};
149 	struct mlx5_ib_flow_counters_desc *desc;
150 	int ret, i;
151 
152 	mutex_lock(&mcounters->mcntrs_mutex);
153 	if (mcounters->cntrs_max_index > read_attr->ncounters) {
154 		ret = -EINVAL;
155 		goto err_bound;
156 	}
157 
158 	mread_attr.out = kcalloc(mcounters->counters_num, sizeof(u64),
159 				 GFP_KERNEL);
160 	if (!mread_attr.out) {
161 		ret = -ENOMEM;
162 		goto err_bound;
163 	}
164 
165 	mread_attr.hw_cntrs_hndl = mcounters->hw_cntrs_hndl;
166 	mread_attr.flags = read_attr->flags;
167 	ret = mcounters->read_counters(counters->device, &mread_attr);
168 	if (ret)
169 		goto err_read;
170 
171 	/* do the pass over the counters data array to assign according to the
172 	 * descriptions and indexing pairs
173 	 */
174 	desc = mcounters->counters_data;
175 	for (i = 0; i < mcounters->ncounters; i++)
176 		read_attr->counters_buff[desc[i].index] += mread_attr.out[desc[i].description];
177 
178 err_read:
179 	kfree(mread_attr.out);
180 err_bound:
181 	mutex_unlock(&mcounters->mcntrs_mutex);
182 	return ret;
183 }
184 
185 static int mlx5_ib_destroy_counters(struct ib_counters *counters)
186 {
187 	struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
188 
189 	mlx5_ib_counters_clear_description(counters);
190 	if (mcounters->hw_cntrs_hndl)
191 		mlx5_fc_destroy(to_mdev(counters->device)->mdev,
192 				mcounters->hw_cntrs_hndl);
193 	return 0;
194 }
195 
/*
 * create_counters hook: the only driver-side setup done here is initialising
 * the mutex that serialises access to the object's description state.
 * @attrs is not used. Always returns 0.
 */
196 static int mlx5_ib_create_counters(struct ib_counters *counters,
197 				   struct uverbs_attr_bundle *attrs)
198 {
199 	struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
200 
201 	mutex_init(&mcounters->mcntrs_mutex);
202 	return 0;
203 }
204 
205 static bool vport_qcounters_supported(struct mlx5_ib_dev *dev)
206 {
207 	return MLX5_CAP_GEN(dev->mdev, q_counter_other_vport) &&
208 	       MLX5_CAP_GEN(dev->mdev, q_counter_aggregation);
209 }
210 
211 static const struct mlx5_ib_counters *get_counters(struct mlx5_ib_dev *dev,
212 						   u32 port_num)
213 {
214 	if ((is_mdev_switchdev_mode(dev->mdev) &&
215 	     !vport_qcounters_supported(dev)) || !port_num)
216 		return &dev->port[0].cnts;
217 
218 	return is_mdev_switchdev_mode(dev->mdev) ?
219 	       &dev->port[1].cnts : &dev->port[port_num - 1].cnts;
220 }
221 
222 /**
223  * mlx5_ib_get_counters_id - Returns counters id to use for device+port
224  * @dev:	Pointer to mlx5 IB device
225  * @port_num:	Zero based port number
226  *
227  * mlx5_ib_get_counters_id() Returns counters set id to use for given
228  * device port combination in switchdev and non switchdev mode of the
229  * parent device.
230  */
231 u16 mlx5_ib_get_counters_id(struct mlx5_ib_dev *dev, u32 port_num)
232 {
233 	const struct mlx5_ib_counters *cnts = get_counters(dev, port_num + 1);
234 
235 	return cnts->set_id;
236 }
237 
238 static struct rdma_hw_stats *do_alloc_stats(const struct mlx5_ib_counters *cnts)
239 {
240 	struct rdma_hw_stats *stats;
241 	u32 num_hw_counters;
242 	int i;
243 
244 	num_hw_counters = cnts->num_q_counters + cnts->num_cong_counters +
245 			  cnts->num_ext_ppcnt_counters;
246 	stats = rdma_alloc_hw_stats_struct(cnts->descs,
247 					   num_hw_counters +
248 					   cnts->num_op_counters,
249 					   RDMA_HW_STATS_DEFAULT_LIFESPAN);
250 	if (!stats)
251 		return NULL;
252 
253 	for (i = 0; i < cnts->num_op_counters; i++)
254 		set_bit(num_hw_counters + i, stats->is_disabled);
255 
256 	return stats;
257 }
258 
259 static struct rdma_hw_stats *
260 mlx5_ib_alloc_hw_device_stats(struct ib_device *ibdev)
261 {
262 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
263 	const struct mlx5_ib_counters *cnts = &dev->port[0].cnts;
264 
265 	return do_alloc_stats(cnts);
266 }
267 
268 static struct rdma_hw_stats *
269 mlx5_ib_alloc_hw_port_stats(struct ib_device *ibdev, u32 port_num)
270 {
271 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
272 	const struct mlx5_ib_counters *cnts = get_counters(dev, port_num);
273 
274 	return do_alloc_stats(cnts);
275 }
276 
277 static int mlx5_ib_query_q_counters(struct mlx5_core_dev *mdev,
278 				    const struct mlx5_ib_counters *cnts,
279 				    struct rdma_hw_stats *stats,
280 				    u16 set_id)
281 {
282 	u32 out[MLX5_ST_SZ_DW(query_q_counter_out)] = {};
283 	u32 in[MLX5_ST_SZ_DW(query_q_counter_in)] = {};
284 	__be32 val;
285 	int ret, i;
286 
287 	MLX5_SET(query_q_counter_in, in, opcode, MLX5_CMD_OP_QUERY_Q_COUNTER);
288 	MLX5_SET(query_q_counter_in, in, counter_set_id, set_id);
289 	ret = mlx5_cmd_exec_inout(mdev, query_q_counter, in, out);
290 	if (ret)
291 		return ret;
292 
293 	for (i = 0; i < cnts->num_q_counters; i++) {
294 		val = *(__be32 *)((void *)out + cnts->offsets[i]);
295 		stats->value[i] = (u64)be32_to_cpu(val);
296 	}
297 
298 	return 0;
299 }
300 
301 static int mlx5_ib_query_ext_ppcnt_counters(struct mlx5_ib_dev *dev,
302 					    const struct mlx5_ib_counters *cnts,
303 					    struct rdma_hw_stats *stats)
304 {
305 	int offset = cnts->num_q_counters + cnts->num_cong_counters;
306 	u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {};
307 	int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
308 	int ret, i;
309 	void *out;
310 
311 	out = kvzalloc(sz, GFP_KERNEL);
312 	if (!out)
313 		return -ENOMEM;
314 
315 	MLX5_SET(ppcnt_reg, in, local_port, 1);
316 	MLX5_SET(ppcnt_reg, in, grp, MLX5_ETHERNET_EXTENDED_COUNTERS_GROUP);
317 	ret = mlx5_core_access_reg(dev->mdev, in, sz, out, sz, MLX5_REG_PPCNT,
318 				   0, 0);
319 	if (ret)
320 		goto free;
321 
322 	for (i = 0; i < cnts->num_ext_ppcnt_counters; i++)
323 		stats->value[i + offset] =
324 			be64_to_cpup((__be64 *)(out +
325 				    cnts->offsets[i + offset]));
326 free:
327 	kvfree(out);
328 	return ret;
329 }
330 
331 static int mlx5_ib_query_q_counters_vport(struct mlx5_ib_dev *dev,
332 					  u32 port_num,
333 					  const struct mlx5_ib_counters *cnts,
334 					  struct rdma_hw_stats *stats)
335 
336 {
337 	u32 out[MLX5_ST_SZ_DW(query_q_counter_out)] = {};
338 	u32 in[MLX5_ST_SZ_DW(query_q_counter_in)] = {};
339 	struct mlx5_core_dev *mdev;
340 	__be32 val;
341 	int ret, i;
342 
343 	if (!dev->port[port_num].rep ||
344 	    dev->port[port_num].rep->vport == MLX5_VPORT_UPLINK)
345 		return 0;
346 
347 	mdev = mlx5_eswitch_get_core_dev(dev->port[port_num].rep->esw);
348 	if (!mdev)
349 		return -EOPNOTSUPP;
350 
351 	MLX5_SET(query_q_counter_in, in, opcode, MLX5_CMD_OP_QUERY_Q_COUNTER);
352 	MLX5_SET(query_q_counter_in, in, other_vport, 1);
353 	MLX5_SET(query_q_counter_in, in, vport_number,
354 		 dev->port[port_num].rep->vport);
355 	MLX5_SET(query_q_counter_in, in, aggregate, 1);
356 	ret = mlx5_cmd_exec_inout(mdev, query_q_counter, in, out);
357 	if (ret)
358 		return ret;
359 
360 	for (i = 0; i < cnts->num_q_counters; i++) {
361 		val = *(__be32 *)((void *)out + cnts->offsets[i]);
362 		stats->value[i] = (u64)be32_to_cpu(val);
363 	}
364 
365 	return 0;
366 }
367 
368 static int do_get_hw_stats(struct ib_device *ibdev,
369 			   struct rdma_hw_stats *stats,
370 			   u32 port_num, int index)
371 {
372 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
373 	const struct mlx5_ib_counters *cnts = get_counters(dev, port_num);
374 	struct mlx5_core_dev *mdev;
375 	int ret, num_counters;
376 
377 	if (!stats)
378 		return -EINVAL;
379 
380 	num_counters = cnts->num_q_counters +
381 		       cnts->num_cong_counters +
382 		       cnts->num_ext_ppcnt_counters;
383 
384 	if (is_mdev_switchdev_mode(dev->mdev) && dev->is_rep && port_num != 0)
385 		ret = mlx5_ib_query_q_counters_vport(dev, port_num - 1, cnts,
386 						     stats);
387 	else
388 		ret = mlx5_ib_query_q_counters(dev->mdev, cnts, stats,
389 					       cnts->set_id);
390 	if (ret)
391 		return ret;
392 
393 	/* We don't expose device counters over Vports */
394 	if (is_mdev_switchdev_mode(dev->mdev) && port_num != 0)
395 		goto done;
396 
397 	if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) {
398 		ret =  mlx5_ib_query_ext_ppcnt_counters(dev, cnts, stats);
399 		if (ret)
400 			return ret;
401 	}
402 
403 	if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
404 		if (!port_num)
405 			port_num = 1;
406 		mdev = mlx5_ib_get_native_port_mdev(dev, port_num, NULL);
407 		if (!mdev) {
408 			/* If port is not affiliated yet, its in down state
409 			 * which doesn't have any counters yet, so it would be
410 			 * zero. So no need to read from the HCA.
411 			 */
412 			goto done;
413 		}
414 		ret = mlx5_lag_query_cong_counters(dev->mdev,
415 						   stats->value +
416 						   cnts->num_q_counters,
417 						   cnts->num_cong_counters,
418 						   cnts->offsets +
419 						   cnts->num_q_counters);
420 
421 		mlx5_ib_put_native_port_mdev(dev, port_num);
422 		if (ret)
423 			return ret;
424 	}
425 
426 done:
427 	return num_counters;
428 }
429 
430 static int do_get_op_stat(struct ib_device *ibdev,
431 			  struct rdma_hw_stats *stats,
432 			  u32 port_num, int index)
433 {
434 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
435 	const struct mlx5_ib_counters *cnts;
436 	const struct mlx5_ib_op_fc *opfcs;
437 	u64 packets = 0, bytes;
438 	u32 type;
439 	int ret;
440 
441 	cnts = get_counters(dev, port_num);
442 
443 	opfcs = cnts->opfcs;
444 	type = *(u32 *)cnts->descs[index].priv;
445 	if (type >= MLX5_IB_OPCOUNTER_MAX)
446 		return -EINVAL;
447 
448 	if (!opfcs[type].fc)
449 		goto out;
450 
451 	ret = mlx5_fc_query(dev->mdev, opfcs[type].fc,
452 			    &packets, &bytes);
453 	if (ret)
454 		return ret;
455 
456 out:
457 	stats->value[index] = packets;
458 	return index;
459 }
460 
461 static int do_get_op_stats(struct ib_device *ibdev,
462 			   struct rdma_hw_stats *stats,
463 			   u32 port_num)
464 {
465 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
466 	const struct mlx5_ib_counters *cnts;
467 	int index, ret, num_hw_counters;
468 
469 	cnts = get_counters(dev, port_num);
470 	num_hw_counters = cnts->num_q_counters + cnts->num_cong_counters +
471 			  cnts->num_ext_ppcnt_counters;
472 	for (index = num_hw_counters;
473 	     index < (num_hw_counters + cnts->num_op_counters); index++) {
474 		ret = do_get_op_stat(ibdev, stats, port_num, index);
475 		if (ret != index)
476 			return ret;
477 	}
478 
479 	return cnts->num_op_counters;
480 }
481 
482 static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
483 				struct rdma_hw_stats *stats,
484 				u32 port_num, int index)
485 {
486 	int num_counters, num_hw_counters, num_op_counters;
487 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
488 	const struct mlx5_ib_counters *cnts;
489 
490 	cnts = get_counters(dev, port_num);
491 	num_hw_counters = cnts->num_q_counters + cnts->num_cong_counters +
492 		cnts->num_ext_ppcnt_counters;
493 	num_counters = num_hw_counters + cnts->num_op_counters;
494 
495 	if (index < 0 || index > num_counters)
496 		return -EINVAL;
497 	else if (index > 0 && index < num_hw_counters)
498 		return do_get_hw_stats(ibdev, stats, port_num, index);
499 	else if (index >= num_hw_counters && index < num_counters)
500 		return do_get_op_stat(ibdev, stats, port_num, index);
501 
502 	num_hw_counters = do_get_hw_stats(ibdev, stats, port_num, index);
503 	if (num_hw_counters < 0)
504 		return num_hw_counters;
505 
506 	num_op_counters = do_get_op_stats(ibdev, stats, port_num);
507 	if (num_op_counters < 0)
508 		return num_op_counters;
509 
510 	return num_hw_counters + num_op_counters;
511 }
512 
513 static struct rdma_hw_stats *
514 mlx5_ib_counter_alloc_stats(struct rdma_counter *counter)
515 {
516 	struct mlx5_ib_dev *dev = to_mdev(counter->device);
517 	const struct mlx5_ib_counters *cnts = get_counters(dev, counter->port);
518 
519 	return do_alloc_stats(cnts);
520 }
521 
522 static int mlx5_ib_counter_update_stats(struct rdma_counter *counter)
523 {
524 	struct mlx5_ib_dev *dev = to_mdev(counter->device);
525 	const struct mlx5_ib_counters *cnts = get_counters(dev, counter->port);
526 
527 	return mlx5_ib_query_q_counters(dev->mdev, cnts,
528 					counter->stats, counter->id);
529 }
530 
531 static int mlx5_ib_counter_dealloc(struct rdma_counter *counter)
532 {
533 	struct mlx5_ib_dev *dev = to_mdev(counter->device);
534 	u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {};
535 
536 	if (!counter->id)
537 		return 0;
538 
539 	MLX5_SET(dealloc_q_counter_in, in, opcode,
540 		 MLX5_CMD_OP_DEALLOC_Q_COUNTER);
541 	MLX5_SET(dealloc_q_counter_in, in, counter_set_id, counter->id);
542 	return mlx5_cmd_exec_in(dev->mdev, dealloc_q_counter, in);
543 }
544 
545 static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter,
546 				   struct ib_qp *qp)
547 {
548 	struct mlx5_ib_dev *dev = to_mdev(qp->device);
549 	int err;
550 
551 	if (!counter->id) {
552 		u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)] = {};
553 		u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)] = {};
554 
555 		MLX5_SET(alloc_q_counter_in, in, opcode,
556 			 MLX5_CMD_OP_ALLOC_Q_COUNTER);
557 		MLX5_SET(alloc_q_counter_in, in, uid, MLX5_SHARED_RESOURCE_UID);
558 		err = mlx5_cmd_exec_inout(dev->mdev, alloc_q_counter, in, out);
559 		if (err)
560 			return err;
561 		counter->id =
562 			MLX5_GET(alloc_q_counter_out, out, counter_set_id);
563 	}
564 
565 	err = mlx5_ib_qp_set_counter(qp, counter);
566 	if (err)
567 		goto fail_set_counter;
568 
569 	return 0;
570 
571 fail_set_counter:
572 	mlx5_ib_counter_dealloc(counter);
573 	counter->id = 0;
574 
575 	return err;
576 }
577 
578 static int mlx5_ib_counter_unbind_qp(struct ib_qp *qp)
579 {
580 	return mlx5_ib_qp_set_counter(qp, NULL);
581 }
582 
583 static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev,
584 				  struct rdma_stat_desc *descs, size_t *offsets,
585 				  u32 port_num)
586 {
587 	bool is_vport = is_mdev_switchdev_mode(dev->mdev) &&
588 			port_num != MLX5_VPORT_PF;
589 	const struct mlx5_ib_counter *names;
590 	int j = 0, i, size;
591 
592 	names = is_vport ? vport_basic_q_cnts : basic_q_cnts;
593 	size = is_vport ? ARRAY_SIZE(vport_basic_q_cnts) :
594 			  ARRAY_SIZE(basic_q_cnts);
595 	for (i = 0; i < size; i++, j++) {
596 		descs[j].name = names[i].name;
597 		offsets[j] = names[i].offset;
598 	}
599 
600 	names = is_vport ? vport_out_of_seq_q_cnts : out_of_seq_q_cnts;
601 	size = is_vport ? ARRAY_SIZE(vport_out_of_seq_q_cnts) :
602 			  ARRAY_SIZE(out_of_seq_q_cnts);
603 	if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt)) {
604 		for (i = 0; i < size; i++, j++) {
605 			descs[j].name = names[i].name;
606 			offsets[j] = names[i].offset;
607 		}
608 	}
609 
610 	names = is_vport ? vport_retrans_q_cnts : retrans_q_cnts;
611 	size = is_vport ? ARRAY_SIZE(vport_retrans_q_cnts) :
612 			  ARRAY_SIZE(retrans_q_cnts);
613 	if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) {
614 		for (i = 0; i < size; i++, j++) {
615 			descs[j].name = names[i].name;
616 			offsets[j] = names[i].offset;
617 		}
618 	}
619 
620 	names = is_vport ? vport_extended_err_cnts : extended_err_cnts;
621 	size = is_vport ? ARRAY_SIZE(vport_extended_err_cnts) :
622 			  ARRAY_SIZE(extended_err_cnts);
623 	if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters)) {
624 		for (i = 0; i < size; i++, j++) {
625 			descs[j].name = names[i].name;
626 			offsets[j] = names[i].offset;
627 		}
628 	}
629 
630 	names = is_vport ? vport_roce_accl_cnts : roce_accl_cnts;
631 	size = is_vport ? ARRAY_SIZE(vport_roce_accl_cnts) :
632 			  ARRAY_SIZE(roce_accl_cnts);
633 	if (MLX5_CAP_GEN(dev->mdev, roce_accl)) {
634 		for (i = 0; i < size; i++, j++) {
635 			descs[j].name = names[i].name;
636 			offsets[j] = names[i].offset;
637 		}
638 	}
639 
640 	if (is_vport)
641 		return;
642 
643 	if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
644 		for (i = 0; i < ARRAY_SIZE(cong_cnts); i++, j++) {
645 			descs[j].name = cong_cnts[i].name;
646 			offsets[j] = cong_cnts[i].offset;
647 		}
648 	}
649 
650 	if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) {
651 		for (i = 0; i < ARRAY_SIZE(ext_ppcnt_cnts); i++, j++) {
652 			descs[j].name = ext_ppcnt_cnts[i].name;
653 			offsets[j] = ext_ppcnt_cnts[i].offset;
654 		}
655 	}
656 
657 	for (i = 0; i < ARRAY_SIZE(basic_op_cnts); i++, j++) {
658 		descs[j].name = basic_op_cnts[i].name;
659 		descs[j].flags |= IB_STAT_FLAG_OPTIONAL;
660 		descs[j].priv = &basic_op_cnts[i].type;
661 	}
662 
663 	if (MLX5_CAP_FLOWTABLE(dev->mdev,
664 			       ft_field_support_2_nic_receive_rdma.bth_opcode)) {
665 		for (i = 0; i < ARRAY_SIZE(rdmarx_cnp_op_cnts); i++, j++) {
666 			descs[j].name = rdmarx_cnp_op_cnts[i].name;
667 			descs[j].flags |= IB_STAT_FLAG_OPTIONAL;
668 			descs[j].priv = &rdmarx_cnp_op_cnts[i].type;
669 		}
670 	}
671 
672 	if (MLX5_CAP_FLOWTABLE(dev->mdev,
673 			       ft_field_support_2_nic_transmit_rdma.bth_opcode)) {
674 		for (i = 0; i < ARRAY_SIZE(rdmatx_cnp_op_cnts); i++, j++) {
675 			descs[j].name = rdmatx_cnp_op_cnts[i].name;
676 			descs[j].flags |= IB_STAT_FLAG_OPTIONAL;
677 			descs[j].priv = &rdmatx_cnp_op_cnts[i].type;
678 		}
679 	}
680 }
681 
682 
683 static int __mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev,
684 				    struct mlx5_ib_counters *cnts, u32 port_num)
685 {
686 	bool is_vport = is_mdev_switchdev_mode(dev->mdev) &&
687 			port_num != MLX5_VPORT_PF;
688 	u32 num_counters, num_op_counters = 0, size;
689 
690 	size = is_vport ? ARRAY_SIZE(vport_basic_q_cnts) :
691 			  ARRAY_SIZE(basic_q_cnts);
692 	num_counters = size;
693 
694 	size = is_vport ? ARRAY_SIZE(vport_out_of_seq_q_cnts) :
695 			  ARRAY_SIZE(out_of_seq_q_cnts);
696 	if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt))
697 		num_counters += size;
698 
699 	size = is_vport ? ARRAY_SIZE(vport_retrans_q_cnts) :
700 			  ARRAY_SIZE(retrans_q_cnts);
701 	if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters))
702 		num_counters += size;
703 
704 	size = is_vport ? ARRAY_SIZE(vport_extended_err_cnts) :
705 			  ARRAY_SIZE(extended_err_cnts);
706 	if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters))
707 		num_counters += size;
708 
709 	size = is_vport ? ARRAY_SIZE(vport_roce_accl_cnts) :
710 			  ARRAY_SIZE(roce_accl_cnts);
711 	if (MLX5_CAP_GEN(dev->mdev, roce_accl))
712 		num_counters += size;
713 
714 	cnts->num_q_counters = num_counters;
715 
716 	if (is_vport)
717 		goto skip_non_qcounters;
718 
719 	if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
720 		cnts->num_cong_counters = ARRAY_SIZE(cong_cnts);
721 		num_counters += ARRAY_SIZE(cong_cnts);
722 	}
723 	if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) {
724 		cnts->num_ext_ppcnt_counters = ARRAY_SIZE(ext_ppcnt_cnts);
725 		num_counters += ARRAY_SIZE(ext_ppcnt_cnts);
726 	}
727 
728 	num_op_counters = ARRAY_SIZE(basic_op_cnts);
729 
730 	if (MLX5_CAP_FLOWTABLE(dev->mdev,
731 			       ft_field_support_2_nic_receive_rdma.bth_opcode))
732 		num_op_counters += ARRAY_SIZE(rdmarx_cnp_op_cnts);
733 
734 	if (MLX5_CAP_FLOWTABLE(dev->mdev,
735 			       ft_field_support_2_nic_transmit_rdma.bth_opcode))
736 		num_op_counters += ARRAY_SIZE(rdmatx_cnp_op_cnts);
737 
738 skip_non_qcounters:
739 	cnts->num_op_counters = num_op_counters;
740 	num_counters += num_op_counters;
741 	cnts->descs = kcalloc(num_counters,
742 			      sizeof(struct rdma_stat_desc), GFP_KERNEL);
743 	if (!cnts->descs)
744 		return -ENOMEM;
745 
746 	cnts->offsets = kcalloc(num_counters,
747 				sizeof(*cnts->offsets), GFP_KERNEL);
748 	if (!cnts->offsets)
749 		goto err;
750 
751 	return 0;
752 
753 err:
754 	kfree(cnts->descs);
755 	cnts->descs = NULL;
756 	return -ENOMEM;
757 }
758 
759 static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev)
760 {
761 	u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {};
762 	int num_cnt_ports = dev->num_ports;
763 	int i, j;
764 
765 	if (is_mdev_switchdev_mode(dev->mdev))
766 		num_cnt_ports = min(2, num_cnt_ports);
767 
768 	MLX5_SET(dealloc_q_counter_in, in, opcode,
769 		 MLX5_CMD_OP_DEALLOC_Q_COUNTER);
770 
771 	for (i = 0; i < num_cnt_ports; i++) {
772 		if (dev->port[i].cnts.set_id) {
773 			MLX5_SET(dealloc_q_counter_in, in, counter_set_id,
774 				 dev->port[i].cnts.set_id);
775 			mlx5_cmd_exec_in(dev->mdev, dealloc_q_counter, in);
776 		}
777 		kfree(dev->port[i].cnts.descs);
778 		kfree(dev->port[i].cnts.offsets);
779 
780 		for (j = 0; j < MLX5_IB_OPCOUNTER_MAX; j++) {
781 			if (!dev->port[i].cnts.opfcs[j].fc)
782 				continue;
783 
784 			if (IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS))
785 				mlx5_ib_fs_remove_op_fc(dev,
786 					&dev->port[i].cnts.opfcs[j], j);
787 			mlx5_fc_destroy(dev->mdev,
788 					dev->port[i].cnts.opfcs[j].fc);
789 			dev->port[i].cnts.opfcs[j].fc = NULL;
790 		}
791 	}
792 }
793 
794 static int mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev)
795 {
796 	u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)] = {};
797 	u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)] = {};
798 	int num_cnt_ports = dev->num_ports;
799 	int err = 0;
800 	int i;
801 	bool is_shared;
802 
803 	MLX5_SET(alloc_q_counter_in, in, opcode, MLX5_CMD_OP_ALLOC_Q_COUNTER);
804 	is_shared = MLX5_CAP_GEN(dev->mdev, log_max_uctx) != 0;
805 
806 	/*
807 	 * In switchdev we need to allocate two ports, one that is used for
808 	 * the device Q_counters and it is essentially the real Q_counters of
809 	 * this device, while the other is used as a helper for PF to be able to
810 	 * query all other vports.
811 	 */
812 	if (is_mdev_switchdev_mode(dev->mdev))
813 		num_cnt_ports = min(2, num_cnt_ports);
814 
815 	for (i = 0; i < num_cnt_ports; i++) {
816 		err = __mlx5_ib_alloc_counters(dev, &dev->port[i].cnts, i);
817 		if (err)
818 			goto err_alloc;
819 
820 		mlx5_ib_fill_counters(dev, dev->port[i].cnts.descs,
821 				      dev->port[i].cnts.offsets, i);
822 
823 		MLX5_SET(alloc_q_counter_in, in, uid,
824 			 is_shared ? MLX5_SHARED_RESOURCE_UID : 0);
825 
826 		err = mlx5_cmd_exec_inout(dev->mdev, alloc_q_counter, in, out);
827 		if (err) {
828 			mlx5_ib_warn(dev,
829 				     "couldn't allocate queue counter for port %d, err %d\n",
830 				     i + 1, err);
831 			goto err_alloc;
832 		}
833 
834 		dev->port[i].cnts.set_id =
835 			MLX5_GET(alloc_q_counter_out, out, counter_set_id);
836 	}
837 	return 0;
838 
839 err_alloc:
840 	mlx5_ib_dealloc_counters(dev);
841 	return err;
842 }
843 
844 static int read_flow_counters(struct ib_device *ibdev,
845 			      struct mlx5_read_counters_attr *read_attr)
846 {
847 	struct mlx5_fc *fc = read_attr->hw_cntrs_hndl;
848 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
849 
850 	return mlx5_fc_query(dev->mdev, fc,
851 			     &read_attr->out[IB_COUNTER_PACKETS],
852 			     &read_attr->out[IB_COUNTER_BYTES]);
853 }
854 
855 /* flow counters currently expose two counters packets and bytes */
856 #define FLOW_COUNTERS_NUM 2
857 static int counters_set_description(
858 	struct ib_counters *counters, enum mlx5_ib_counters_type counters_type,
859 	struct mlx5_ib_flow_counters_desc *desc_data, u32 ncounters)
860 {
861 	struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
862 	u32 cntrs_max_index = 0;
863 	int i;
864 
865 	if (counters_type != MLX5_IB_COUNTERS_FLOW)
866 		return -EINVAL;
867 
868 	/* init the fields for the object */
869 	mcounters->type = counters_type;
870 	mcounters->read_counters = read_flow_counters;
871 	mcounters->counters_num = FLOW_COUNTERS_NUM;
872 	mcounters->ncounters = ncounters;
873 	/* each counter entry have both description and index pair */
874 	for (i = 0; i < ncounters; i++) {
875 		if (desc_data[i].description > IB_COUNTER_BYTES)
876 			return -EINVAL;
877 
878 		if (cntrs_max_index <= desc_data[i].index)
879 			cntrs_max_index = desc_data[i].index + 1;
880 	}
881 
882 	mutex_lock(&mcounters->mcntrs_mutex);
883 	mcounters->counters_data = desc_data;
884 	mcounters->cntrs_max_index = cntrs_max_index;
885 	mutex_unlock(&mcounters->mcntrs_mutex);
886 
887 	return 0;
888 }
889 
890 #define MAX_COUNTERS_NUM (USHRT_MAX / (sizeof(u32) * 2))
891 int mlx5_ib_flow_counters_set_data(struct ib_counters *ibcounters,
892 				   struct mlx5_ib_create_flow *ucmd)
893 {
894 	struct mlx5_ib_mcounters *mcounters = to_mcounters(ibcounters);
895 	struct mlx5_ib_flow_counters_data *cntrs_data = NULL;
896 	struct mlx5_ib_flow_counters_desc *desc_data = NULL;
897 	bool hw_hndl = false;
898 	int ret = 0;
899 
900 	if (ucmd && ucmd->ncounters_data != 0) {
901 		cntrs_data = ucmd->data;
902 		if (cntrs_data->ncounters > MAX_COUNTERS_NUM)
903 			return -EINVAL;
904 
905 		desc_data = kcalloc(cntrs_data->ncounters,
906 				    sizeof(*desc_data),
907 				    GFP_KERNEL);
908 		if (!desc_data)
909 			return  -ENOMEM;
910 
911 		if (copy_from_user(desc_data,
912 				   u64_to_user_ptr(cntrs_data->counters_data),
913 				   sizeof(*desc_data) * cntrs_data->ncounters)) {
914 			ret = -EFAULT;
915 			goto free;
916 		}
917 	}
918 
919 	if (!mcounters->hw_cntrs_hndl) {
920 		mcounters->hw_cntrs_hndl = mlx5_fc_create(
921 			to_mdev(ibcounters->device)->mdev, false);
922 		if (IS_ERR(mcounters->hw_cntrs_hndl)) {
923 			ret = PTR_ERR(mcounters->hw_cntrs_hndl);
924 			goto free;
925 		}
926 		hw_hndl = true;
927 	}
928 
929 	if (desc_data) {
930 		/* counters already bound to at least one flow */
931 		if (mcounters->cntrs_max_index) {
932 			ret = -EINVAL;
933 			goto free_hndl;
934 		}
935 
936 		ret = counters_set_description(ibcounters,
937 					       MLX5_IB_COUNTERS_FLOW,
938 					       desc_data,
939 					       cntrs_data->ncounters);
940 		if (ret)
941 			goto free_hndl;
942 
943 	} else if (!mcounters->cntrs_max_index) {
944 		/* counters not bound yet, must have udata passed */
945 		ret = -EINVAL;
946 		goto free_hndl;
947 	}
948 
949 	return 0;
950 
951 free_hndl:
952 	if (hw_hndl) {
953 		mlx5_fc_destroy(to_mdev(ibcounters->device)->mdev,
954 				mcounters->hw_cntrs_hndl);
955 		mcounters->hw_cntrs_hndl = NULL;
956 	}
957 free:
958 	kfree(desc_data);
959 	return ret;
960 }
961 
962 void mlx5_ib_counters_clear_description(struct ib_counters *counters)
963 {
964 	struct mlx5_ib_mcounters *mcounters;
965 
966 	if (!counters || atomic_read(&counters->usecnt) != 1)
967 		return;
968 
969 	mcounters = to_mcounters(counters);
970 
971 	mutex_lock(&mcounters->mcntrs_mutex);
972 	kfree(mcounters->counters_data);
973 	mcounters->counters_data = NULL;
974 	mcounters->cntrs_max_index = 0;
975 	mutex_unlock(&mcounters->mcntrs_mutex);
976 }
977 
978 static int mlx5_ib_modify_stat(struct ib_device *device, u32 port,
979 			       unsigned int index, bool enable)
980 {
981 	struct mlx5_ib_dev *dev = to_mdev(device);
982 	struct mlx5_ib_counters *cnts;
983 	struct mlx5_ib_op_fc *opfc;
984 	u32 num_hw_counters, type;
985 	int ret;
986 
987 	cnts = &dev->port[port - 1].cnts;
988 	num_hw_counters = cnts->num_q_counters + cnts->num_cong_counters +
989 		cnts->num_ext_ppcnt_counters;
990 	if (index < num_hw_counters ||
991 	    index >= (num_hw_counters + cnts->num_op_counters))
992 		return -EINVAL;
993 
994 	if (!(cnts->descs[index].flags & IB_STAT_FLAG_OPTIONAL))
995 		return -EINVAL;
996 
997 	type = *(u32 *)cnts->descs[index].priv;
998 	if (type >= MLX5_IB_OPCOUNTER_MAX)
999 		return -EINVAL;
1000 
1001 	opfc = &cnts->opfcs[type];
1002 
1003 	if (enable) {
1004 		if (opfc->fc)
1005 			return -EEXIST;
1006 
1007 		opfc->fc = mlx5_fc_create(dev->mdev, false);
1008 		if (IS_ERR(opfc->fc))
1009 			return PTR_ERR(opfc->fc);
1010 
1011 		ret = mlx5_ib_fs_add_op_fc(dev, port, opfc, type);
1012 		if (ret) {
1013 			mlx5_fc_destroy(dev->mdev, opfc->fc);
1014 			opfc->fc = NULL;
1015 		}
1016 		return ret;
1017 	}
1018 
1019 	if (!opfc->fc)
1020 		return -EINVAL;
1021 
1022 	mlx5_ib_fs_remove_op_fc(dev, opfc, type);
1023 	mlx5_fc_destroy(dev->mdev, opfc->fc);
1024 	opfc->fc = NULL;
1025 	return 0;
1026 }
1027 
1028 static const struct ib_device_ops hw_stats_ops = {
1029 	.alloc_hw_port_stats = mlx5_ib_alloc_hw_port_stats,
1030 	.get_hw_stats = mlx5_ib_get_hw_stats,
1031 	.counter_bind_qp = mlx5_ib_counter_bind_qp,
1032 	.counter_unbind_qp = mlx5_ib_counter_unbind_qp,
1033 	.counter_dealloc = mlx5_ib_counter_dealloc,
1034 	.counter_alloc_stats = mlx5_ib_counter_alloc_stats,
1035 	.counter_update_stats = mlx5_ib_counter_update_stats,
1036 	.modify_hw_stat = IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) ?
1037 			  mlx5_ib_modify_stat : NULL,
1038 };
1039 
1040 static const struct ib_device_ops hw_switchdev_vport_op = {
1041 	.alloc_hw_port_stats = mlx5_ib_alloc_hw_port_stats,
1042 };
1043 
1044 static const struct ib_device_ops hw_switchdev_stats_ops = {
1045 	.alloc_hw_device_stats = mlx5_ib_alloc_hw_device_stats,
1046 	.get_hw_stats = mlx5_ib_get_hw_stats,
1047 	.counter_bind_qp = mlx5_ib_counter_bind_qp,
1048 	.counter_unbind_qp = mlx5_ib_counter_unbind_qp,
1049 	.counter_dealloc = mlx5_ib_counter_dealloc,
1050 	.counter_alloc_stats = mlx5_ib_counter_alloc_stats,
1051 	.counter_update_stats = mlx5_ib_counter_update_stats,
1052 };
1053 
1054 static const struct ib_device_ops counters_ops = {
1055 	.create_counters = mlx5_ib_create_counters,
1056 	.destroy_counters = mlx5_ib_destroy_counters,
1057 	.read_counters = mlx5_ib_read_counters,
1058 
1059 	INIT_RDMA_OBJ_SIZE(ib_counters, mlx5_ib_mcounters, ibcntrs),
1060 };
1061 
1062 int mlx5_ib_counters_init(struct mlx5_ib_dev *dev)
1063 {
1064 	ib_set_device_ops(&dev->ib_dev, &counters_ops);
1065 
1066 	if (!MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
1067 		return 0;
1068 
1069 	if (is_mdev_switchdev_mode(dev->mdev)) {
1070 		ib_set_device_ops(&dev->ib_dev, &hw_switchdev_stats_ops);
1071 		if (vport_qcounters_supported(dev))
1072 			ib_set_device_ops(&dev->ib_dev, &hw_switchdev_vport_op);
1073 	} else
1074 		ib_set_device_ops(&dev->ib_dev, &hw_stats_ops);
1075 	return mlx5_ib_alloc_counters(dev);
1076 }
1077 
1078 void mlx5_ib_counters_cleanup(struct mlx5_ib_dev *dev)
1079 {
1080 	if (!MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
1081 		return;
1082 
1083 	mlx5_ib_dealloc_counters(dev);
1084 }
1085