xref: /freebsd/sys/dev/mlx5/mlx5_ib/mlx5_ib_cong.c (revision 35c0a8c449fd2b7f75029ebed5e10852240f0865)
1 /*-
2  * Copyright (c) 2013-2020, Mellanox Technologies, Ltd.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  */
25 
26 #include "opt_rss.h"
27 #include "opt_ratelimit.h"
28 
29 #include <dev/mlx5/mlx5_ib/mlx5_ib.h>
30 #include <dev/mlx5/cmd.h>
31 
32 static const char *mlx5_ib_cong_params_desc[] = {
33 	MLX5_IB_CONG_PARAMS(MLX5_IB_STATS_DESC)
34 };
35 
36 static const char *mlx5_ib_cong_status_desc[] = {
37 	MLX5_IB_CONG_STATUS(MLX5_IB_STATS_DESC)
38 };
39 
40 static const char *mlx5_ib_cong_stats_desc[] = {
41 	MLX5_IB_CONG_STATS(MLX5_IB_STATS_DESC)
42 };
43 
/*
 * Position of "field" inside the u64 array that begins at
 * mlx5_ib_congestion.arg[0], derived from the member offsets.
 */
#define	MLX5_IB_INDEX(field)						\
	((__offsetof(struct mlx5_ib_congestion, field) -		\
	  __offsetof(struct mlx5_ib_congestion, arg[0])) / sizeof(u64))

/* Largest value representable in the bit width of "field" of "type". */
#define	MLX5_IB_FLD_MAX(type, field)					\
	((1ULL << __mlx5_bit_sz(type, field)) - 1ULL)

/*
 * Write "var" into "field", saturating at the field maximum.
 * "var" must be a modifiable lvalue: it is overwritten with the clipped
 * value and is evaluated more than once.
 */
#define	MLX5_IB_SET_CLIPPED(type, ptr, field, var) do {			\
	if ((var) > MLX5_IB_FLD_MAX(type, field))			\
		(var) = MLX5_IB_FLD_MAX(type, field);			\
	MLX5_SET(type, ptr, field, var);				\
} while (0)
55 
/* Exclusive-mode helpers for the sx lock stored in dev->congestion.lock. */
#define	CONG_LOCK(dev)		sx_xlock(&(dev)->congestion.lock)
#define	CONG_UNLOCK(dev)	sx_xunlock(&(dev)->congestion.lock)
#define	CONG_LOCKED(dev)	sx_xlocked(&(dev)->congestion.lock)
59 
/*
 * Attribute bits for the rp_* parameters.  mlx5_ib_set_cc_param_mask_val()
 * ORs one of these into the mask that mlx5_ib_set_cc_params() writes to
 * the field_select member of the modify command.
 */
#define	MLX5_IB_RP_CLAMP_TGT_RATE_ATTR			BIT(1)
#define	MLX5_IB_RP_CLAMP_TGT_RATE_ATI_ATTR		BIT(2)
#define	MLX5_IB_RP_TIME_RESET_ATTR			BIT(3)
#define	MLX5_IB_RP_BYTE_RESET_ATTR			BIT(4)
#define	MLX5_IB_RP_THRESHOLD_ATTR			BIT(5)
/* bit 6 is not assigned in this file */
#define	MLX5_IB_RP_AI_RATE_ATTR				BIT(7)
#define	MLX5_IB_RP_HAI_RATE_ATTR			BIT(8)
#define	MLX5_IB_RP_MIN_DEC_FAC_ATTR			BIT(9)
#define	MLX5_IB_RP_MIN_RATE_ATTR			BIT(10)
#define	MLX5_IB_RP_RATE_TO_SET_ON_FIRST_CNP_ATTR	BIT(11)
#define	MLX5_IB_RP_DCE_TCP_G_ATTR			BIT(12)
#define	MLX5_IB_RP_DCE_TCP_RTT_ATTR			BIT(13)
#define	MLX5_IB_RP_RATE_REDUCE_MONITOR_PERIOD_ATTR	BIT(14)
#define	MLX5_IB_RP_INITIAL_ALPHA_VALUE_ATTR		BIT(15)
#define	MLX5_IB_RP_GD_ATTR				BIT(16)

/* Attribute bits for the np_* parameters, used the same way. */
#define	MLX5_IB_NP_CNP_DSCP_ATTR			BIT(3)
#define	MLX5_IB_NP_CNP_PRIO_MODE_ATTR			BIT(4)
78 
/*
 * Protocol node selector.  The value is passed to the query command and
 * written to the cong_protocol field of the modify command.
 */
enum mlx5_ib_cong_node_type {
	MLX5_IB_RROCE_ECN_RP = 1,	/* owns the rp_* parameters */
	MLX5_IB_RROCE_ECN_NP = 2,	/* owns every other parameter */
};
83 
84 static enum mlx5_ib_cong_node_type
85 mlx5_ib_param_to_node(u32 index)
86 {
87 
88 	if (index >= MLX5_IB_INDEX(rp_clamp_tgt_rate) &&
89 	    index <= MLX5_IB_INDEX(rp_gd))
90 		return MLX5_IB_RROCE_ECN_RP;
91 	else
92 		return MLX5_IB_RROCE_ECN_NP;
93 }
94 
95 static u64
96 mlx5_get_cc_param_val(void *field, u32 index)
97 {
98 
99 	switch (index) {
100 	case MLX5_IB_INDEX(rp_clamp_tgt_rate):
101 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
102 				clamp_tgt_rate);
103 	case MLX5_IB_INDEX(rp_clamp_tgt_rate_ati):
104 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
105 				clamp_tgt_rate_after_time_inc);
106 	case MLX5_IB_INDEX(rp_time_reset):
107 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
108 				rpg_time_reset);
109 	case MLX5_IB_INDEX(rp_byte_reset):
110 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
111 				rpg_byte_reset);
112 	case MLX5_IB_INDEX(rp_threshold):
113 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
114 				rpg_threshold);
115 	case MLX5_IB_INDEX(rp_ai_rate):
116 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
117 				rpg_ai_rate);
118 	case MLX5_IB_INDEX(rp_hai_rate):
119 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
120 				rpg_hai_rate);
121 	case MLX5_IB_INDEX(rp_min_dec_fac):
122 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
123 				rpg_min_dec_fac);
124 	case MLX5_IB_INDEX(rp_min_rate):
125 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
126 				rpg_min_rate);
127 	case MLX5_IB_INDEX(rp_rate_to_set_on_first_cnp):
128 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
129 				rate_to_set_on_first_cnp);
130 	case MLX5_IB_INDEX(rp_dce_tcp_g):
131 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
132 				dce_tcp_g);
133 	case MLX5_IB_INDEX(rp_dce_tcp_rtt):
134 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
135 				dce_tcp_rtt);
136 	case MLX5_IB_INDEX(rp_rate_reduce_monitor_period):
137 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
138 				rate_reduce_monitor_period);
139 	case MLX5_IB_INDEX(rp_initial_alpha_value):
140 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
141 				initial_alpha_value);
142 	case MLX5_IB_INDEX(rp_gd):
143 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
144 				rpg_gd);
145 	case MLX5_IB_INDEX(np_cnp_dscp):
146 		return MLX5_GET(cong_control_r_roce_ecn_np, field,
147 				cnp_dscp);
148 	case MLX5_IB_INDEX(np_cnp_prio_mode):
149 		return MLX5_GET(cong_control_r_roce_ecn_np, field,
150 				cnp_prio_mode);
151 	case MLX5_IB_INDEX(np_cnp_prio):
152 		return MLX5_GET(cong_control_r_roce_ecn_np, field,
153 				cnp_802p_prio);
154 	default:
155 		return 0;
156 	}
157 }
158 
159 static void
160 mlx5_ib_set_cc_param_mask_val(void *field, u32 index,
161     u64 var, u32 *attr_mask)
162 {
163 
164 	switch (index) {
165 	case MLX5_IB_INDEX(rp_clamp_tgt_rate):
166 		*attr_mask |= MLX5_IB_RP_CLAMP_TGT_RATE_ATTR;
167 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
168 			 clamp_tgt_rate, var);
169 		break;
170 	case MLX5_IB_INDEX(rp_clamp_tgt_rate_ati):
171 		*attr_mask |= MLX5_IB_RP_CLAMP_TGT_RATE_ATI_ATTR;
172 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
173 			 clamp_tgt_rate_after_time_inc, var);
174 		break;
175 	case MLX5_IB_INDEX(rp_time_reset):
176 		*attr_mask |= MLX5_IB_RP_TIME_RESET_ATTR;
177 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
178 			 rpg_time_reset, var);
179 		break;
180 	case MLX5_IB_INDEX(rp_byte_reset):
181 		*attr_mask |= MLX5_IB_RP_BYTE_RESET_ATTR;
182 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
183 			 rpg_byte_reset, var);
184 		break;
185 	case MLX5_IB_INDEX(rp_threshold):
186 		*attr_mask |= MLX5_IB_RP_THRESHOLD_ATTR;
187 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
188 			 rpg_threshold, var);
189 		break;
190 	case MLX5_IB_INDEX(rp_ai_rate):
191 		*attr_mask |= MLX5_IB_RP_AI_RATE_ATTR;
192 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
193 			 rpg_ai_rate, var);
194 		break;
195 	case MLX5_IB_INDEX(rp_hai_rate):
196 		*attr_mask |= MLX5_IB_RP_HAI_RATE_ATTR;
197 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
198 			 rpg_hai_rate, var);
199 		break;
200 	case MLX5_IB_INDEX(rp_min_dec_fac):
201 		*attr_mask |= MLX5_IB_RP_MIN_DEC_FAC_ATTR;
202 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
203 			 rpg_min_dec_fac, var);
204 		break;
205 	case MLX5_IB_INDEX(rp_min_rate):
206 		*attr_mask |= MLX5_IB_RP_MIN_RATE_ATTR;
207 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
208 			 rpg_min_rate, var);
209 		break;
210 	case MLX5_IB_INDEX(rp_rate_to_set_on_first_cnp):
211 		*attr_mask |= MLX5_IB_RP_RATE_TO_SET_ON_FIRST_CNP_ATTR;
212 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
213 			 rate_to_set_on_first_cnp, var);
214 		break;
215 	case MLX5_IB_INDEX(rp_dce_tcp_g):
216 		*attr_mask |= MLX5_IB_RP_DCE_TCP_G_ATTR;
217 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
218 			 dce_tcp_g, var);
219 		break;
220 	case MLX5_IB_INDEX(rp_dce_tcp_rtt):
221 		*attr_mask |= MLX5_IB_RP_DCE_TCP_RTT_ATTR;
222 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
223 			 dce_tcp_rtt, var);
224 		break;
225 	case MLX5_IB_INDEX(rp_rate_reduce_monitor_period):
226 		*attr_mask |= MLX5_IB_RP_RATE_REDUCE_MONITOR_PERIOD_ATTR;
227 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
228 			 rate_reduce_monitor_period, var);
229 		break;
230 	case MLX5_IB_INDEX(rp_initial_alpha_value):
231 		*attr_mask |= MLX5_IB_RP_INITIAL_ALPHA_VALUE_ATTR;
232 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
233 			 initial_alpha_value, var);
234 		break;
235 	case MLX5_IB_INDEX(rp_gd):
236 		*attr_mask |= MLX5_IB_RP_GD_ATTR;
237 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
238 			 rpg_gd, var);
239 		break;
240 	case MLX5_IB_INDEX(np_cnp_dscp):
241 		*attr_mask |= MLX5_IB_NP_CNP_DSCP_ATTR;
242 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_np, field, cnp_dscp, var);
243 		break;
244 	case MLX5_IB_INDEX(np_cnp_prio_mode):
245 		*attr_mask |= MLX5_IB_NP_CNP_PRIO_MODE_ATTR;
246 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_np, field, cnp_prio_mode, var);
247 		break;
248 	case MLX5_IB_INDEX(np_cnp_prio):
249 		*attr_mask |= MLX5_IB_NP_CNP_PRIO_MODE_ATTR;
250 		MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_prio_mode, 0);
251 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_np, field, cnp_802p_prio, var);
252 		break;
253 	default:
254 		break;
255 	}
256 }
257 
258 static int
259 mlx5_ib_get_all_cc_params(struct mlx5_ib_dev *dev)
260 {
261 	int outlen = MLX5_ST_SZ_BYTES(query_cong_params_out);
262 	enum mlx5_ib_cong_node_type node = 0;
263 	void *out;
264 	void *field;
265 	u32 x;
266 	int err = 0;
267 
268 	out = kzalloc(outlen, GFP_KERNEL);
269 	if (!out)
270 		return -ENOMEM;
271 
272 	/* get the current values */
273 	for (x = 0; x != MLX5_IB_CONG_PARAMS_NUM; x++) {
274 		if (node != mlx5_ib_param_to_node(x)) {
275 			node = mlx5_ib_param_to_node(x);
276 
277 			err = mlx5_cmd_query_cong_params(dev->mdev, node, out, outlen);
278 			if (err)
279 				break;
280 		}
281 		field = MLX5_ADDR_OF(query_cong_params_out, out, congestion_parameters);
282 		dev->congestion.arg[x] = mlx5_get_cc_param_val(field, x);
283 	}
284 	kfree(out);
285 	return err;
286 }
287 
288 static int
289 mlx5_ib_set_cc_params(struct mlx5_ib_dev *dev, u32 index, u64 var)
290 {
291 	int inlen = MLX5_ST_SZ_BYTES(modify_cong_params_in);
292 	enum mlx5_ib_cong_node_type node;
293 	u32 attr_mask = 0;
294 	void *field;
295 	void *in;
296 	int err;
297 
298 	in = kzalloc(inlen, GFP_KERNEL);
299 	if (!in)
300 		return -ENOMEM;
301 
302 	MLX5_SET(modify_cong_params_in, in, opcode,
303 		 MLX5_CMD_OP_MODIFY_CONG_PARAMS);
304 
305 	node = mlx5_ib_param_to_node(index);
306 	MLX5_SET(modify_cong_params_in, in, cong_protocol, node);
307 
308 	field = MLX5_ADDR_OF(modify_cong_params_in, in, congestion_parameters);
309 	mlx5_ib_set_cc_param_mask_val(field, index, var, &attr_mask);
310 
311 	field = MLX5_ADDR_OF(modify_cong_params_in, in, field_select);
312 	MLX5_SET(field_select_r_roce_rp, field, field_select_r_roce_rp,
313 		 attr_mask);
314 
315 	err = mlx5_cmd_modify_cong_params(dev->mdev, in, inlen);
316 	kfree(in);
317 
318 	return err;
319 }
320 
321 static int
322 mlx5_ib_cong_params_handler(SYSCTL_HANDLER_ARGS)
323 {
324 	struct mlx5_ib_dev *dev = arg1;
325 	u64 value;
326 	int error;
327 
328 	CONG_LOCK(dev);
329 	value = dev->congestion.arg[arg2];
330 	if (req != NULL) {
331 		error = sysctl_handle_64(oidp, &value, 0, req);
332 		if (error || req->newptr == NULL ||
333 		    value == dev->congestion.arg[arg2])
334 			goto done;
335 
336 		/* assign new value */
337 		dev->congestion.arg[arg2] = value;
338 	} else {
339 		error = 0;
340 	}
341 	if (!MLX5_CAP_GEN(dev->mdev, cc_modify_allowed))
342 		error = EPERM;
343 	else {
344 		error = -mlx5_ib_set_cc_params(dev, MLX5_IB_INDEX(arg[arg2]),
345 		    dev->congestion.arg[arg2]);
346 	}
347 done:
348 	CONG_UNLOCK(dev);
349 
350 	return (error);
351 }
352 
353 static int
354 mlx5_ib_get_all_cc_status(struct mlx5_ib_dev *dev)
355 {
356 	const int outlen = MLX5_ST_SZ_BYTES(query_cong_status_out);
357 	uint32_t out[MLX5_ST_SZ_DW(query_cong_status_out)] = {};
358 	int error;
359 
360 #define	MLX5_IB_CONG_STATUS_READ(a,b,c,d,e,node,prio,field) do { \
361 	error = mlx5_cmd_query_cong_status(dev->mdev, node, prio, out, outlen); \
362 	if (error)							\
363 		goto done;						\
364 	dev->congestion.c = MLX5_GET(query_cong_status_out, out, field); \
365 } while (0);
366 
367 	MLX5_IB_CONG_STATUS(MLX5_IB_CONG_STATUS_READ);
368 done:
369 	return (error);
370 }
371 
372 static int
373 mlx5_ib_cong_status_handler(SYSCTL_HANDLER_ARGS)
374 {
375 	const int inlen = MLX5_ST_SZ_BYTES(modify_cong_status_in);
376 	uint32_t in[MLX5_ST_SZ_DW(modify_cong_status_in)] = {};
377 	struct mlx5_ib_dev *dev = arg1;
378 	u64 value;
379 	int error;
380 
381 	CONG_LOCK(dev);
382 	value = dev->congestion.arg[arg2];
383 	if (req != NULL) {
384 		error = sysctl_handle_64(oidp, &value, 0, req);
385 		/* convert value into a boolean */
386 		value = value ? 1 : 0;
387 		if (error || req->newptr == NULL ||
388 		    value == dev->congestion.arg[arg2])
389 			goto done;
390 
391 		/* assign new binary value */
392 		dev->congestion.arg[arg2] = value;
393 	} else {
394 		error = 0;
395 	}
396 	if (!MLX5_CAP_GEN(dev->mdev, cc_modify_allowed))
397 		error = EPERM;
398 	else switch (arg2) {
399 #define	MLX5_IB_CONG_STATUS_WRITE(a,b,c,d,e,node,prio,field)	\
400 	case MLX5_IB_INDEX(c):					\
401 		MLX5_SET(modify_cong_status_in, in, opcode,	\
402 		    MLX5_CMD_OP_MODIFY_CONG_STATUS);		\
403 		MLX5_SET(modify_cong_status_in, in, priority, prio); \
404 		MLX5_SET(modify_cong_status_in, in, cong_protocol, node); \
405 		MLX5_SET(modify_cong_status_in, in, field, value); \
406 		error = -mlx5_cmd_modify_cong_status(dev->mdev, in, inlen); \
407 		break;
408 	MLX5_IB_CONG_STATUS(MLX5_IB_CONG_STATUS_WRITE)
409 	default:
410 		error = EINVAL;
411 		break;
412 	}
413 done:
414 	CONG_UNLOCK(dev);
415 
416 	return (error);
417 }
418 
419 #define	MLX5_GET_UNALIGNED_64(t,p,f) \
420     (((u64)MLX5_GET(t,p,f##_high) << 32) | MLX5_GET(t,p,f##_low))
421 
422 static void
423 mlx5_ib_read_cong_stats(struct work_struct *work)
424 {
425 	struct mlx5_ib_dev *dev =
426 	    container_of(work, struct mlx5_ib_dev, congestion.dwork.work);
427 	const int outlen = MLX5_ST_SZ_BYTES(query_cong_statistics_out);
428 	void *out;
429 
430 	out = kzalloc(outlen, GFP_KERNEL);
431 	if (!out)
432 		goto done;
433 
434 	CONG_LOCK(dev);
435 	if (mlx5_cmd_query_cong_counter(dev->mdev, 0, out, outlen))
436 		memset(out, 0, outlen);
437 
438 	dev->congestion.syndrome =
439 	    MLX5_GET(query_cong_statistics_out, out, syndrome);
440 	dev->congestion.rp_cur_flows =
441 	    MLX5_GET(query_cong_statistics_out, out, rp_cur_flows);
442 	dev->congestion.sum_flows =
443 	    MLX5_GET(query_cong_statistics_out, out, sum_flows);
444 	dev->congestion.rp_cnp_ignored =
445 	    MLX5_GET_UNALIGNED_64(query_cong_statistics_out, out, rp_cnp_ignored);
446 	dev->congestion.rp_cnp_handled =
447 	    MLX5_GET_UNALIGNED_64(query_cong_statistics_out, out, rp_cnp_handled);
448 	dev->congestion.time_stamp =
449 	    MLX5_GET_UNALIGNED_64(query_cong_statistics_out, out, time_stamp);
450 	dev->congestion.accumulators_period =
451 	    MLX5_GET(query_cong_statistics_out, out, accumulators_period);
452 	dev->congestion.np_ecn_marked_roce_packets =
453 	    MLX5_GET_UNALIGNED_64(query_cong_statistics_out, out, np_ecn_marked_roce_packets);
454 	dev->congestion.np_cnp_sent =
455 	    MLX5_GET_UNALIGNED_64(query_cong_statistics_out, out, np_cnp_sent);
456 
457 	CONG_UNLOCK(dev);
458 	kfree(out);
459 
460 done:
461 	schedule_delayed_work(&dev->congestion.dwork, hz);
462 }
463 
464 void
465 mlx5_ib_cleanup_congestion(struct mlx5_ib_dev *dev)
466 {
467 
468 	while (cancel_delayed_work_sync(&dev->congestion.dwork))
469 		;
470 	sysctl_ctx_free(&dev->congestion.ctx);
471 	sx_destroy(&dev->congestion.lock);
472 }
473 
474 int
475 mlx5_ib_init_congestion(struct mlx5_ib_dev *dev)
476 {
477 	struct sysctl_ctx_list *ctx;
478 	struct sysctl_oid *parent;
479 	struct sysctl_oid *node;
480 	int err;
481 	u32 x;
482 
483 	ctx = &dev->congestion.ctx;
484 	sysctl_ctx_init(ctx);
485 	sx_init(&dev->congestion.lock, "mlx5ibcong");
486 	INIT_DELAYED_WORK(&dev->congestion.dwork, mlx5_ib_read_cong_stats);
487 
488 	if (!MLX5_CAP_GEN(dev->mdev, cc_query_allowed))
489 		return (0);
490 
491 	err = mlx5_ib_get_all_cc_params(dev);
492 	if (err)
493 		return (err);
494 
495 	err = mlx5_ib_get_all_cc_status(dev);
496 	if (err)
497 		return (err);
498 
499 	parent = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(dev->ib_dev.dev.kobj.oidp),
500 	    OID_AUTO, "cong", CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
501 	    "Congestion control");
502 	if (parent == NULL)
503 		return (-ENOMEM);
504 
505 	node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(parent),
506 	    OID_AUTO, "conf", CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
507 	    "Configuration");
508 	if (node == NULL) {
509 		sysctl_ctx_free(&dev->congestion.ctx);
510 		return (-ENOMEM);
511 	}
512 
513 	for (x = 0; x != MLX5_IB_CONG_PARAMS_NUM; x++) {
514 		SYSCTL_ADD_PROC(ctx,
515 		    SYSCTL_CHILDREN(node), OID_AUTO,
516 		    mlx5_ib_cong_params_desc[2 * x],
517 		    CTLTYPE_U64 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
518 		    dev, x, &mlx5_ib_cong_params_handler, "QU",
519 		    mlx5_ib_cong_params_desc[2 * x + 1]);
520 	}
521 
522 	for (x = 0; x != MLX5_IB_CONG_STATUS_NUM; x++) {
523 		SYSCTL_ADD_PROC(ctx,
524 		    SYSCTL_CHILDREN(node), OID_AUTO,
525 		    mlx5_ib_cong_status_desc[2 * x],
526 		    CTLTYPE_U64 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
527 		    dev, x + MLX5_IB_CONG_PARAMS_NUM + MLX5_IB_CONG_STATS_NUM,
528 		    &mlx5_ib_cong_status_handler, "QU",
529 		    mlx5_ib_cong_status_desc[2 * x + 1]);
530 	}
531 
532 	node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(parent),
533 	    OID_AUTO, "stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
534 	    "Statistics");
535 	if (node == NULL) {
536 		sysctl_ctx_free(&dev->congestion.ctx);
537 		return (-ENOMEM);
538 	}
539 
540 	for (x = 0; x != MLX5_IB_CONG_STATS_NUM; x++) {
541 		/* read-only SYSCTLs */
542 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(node), OID_AUTO,
543 		    mlx5_ib_cong_stats_desc[2 * x],
544 		    CTLFLAG_RD | CTLFLAG_MPSAFE,
545 		    &dev->congestion.arg[x + MLX5_IB_CONG_PARAMS_NUM],
546 		    0, mlx5_ib_cong_stats_desc[2 * x + 1]);
547 	}
548 	schedule_delayed_work(&dev->congestion.dwork, hz);
549 	return (0);
550 }
551