xref: /freebsd/sys/dev/mlx5/mlx5_ib/mlx5_ib_cong.c (revision e6bfd18d21b225af6a0ed67ceeaf1293b7b9eba5)
1 /*-
2  * Copyright (c) 2013-2020, Mellanox Technologies, Ltd.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  * $FreeBSD$
26  */
27 
28 #include "opt_rss.h"
29 #include "opt_ratelimit.h"
30 
31 #include <dev/mlx5/mlx5_ib/mlx5_ib.h>
32 #include <dev/mlx5/cmd.h>
33 
34 static const char *mlx5_ib_cong_params_desc[] = {
35 	MLX5_IB_CONG_PARAMS(MLX5_IB_STATS_DESC)
36 };
37 
38 static const char *mlx5_ib_cong_status_desc[] = {
39 	MLX5_IB_CONG_STATUS(MLX5_IB_STATS_DESC)
40 };
41 
42 static const char *mlx5_ib_cong_stats_desc[] = {
43 	MLX5_IB_CONG_STATS(MLX5_IB_STATS_DESC)
44 };
45 
/*
 * Position of "field" inside the congestion.arg[] array, derived from
 * its byte offset in struct mlx5_ib_congestion relative to arg[0].
 */
#define	MLX5_IB_INDEX(field) (					\
	(__offsetof(struct mlx5_ib_congestion, field) -		\
	 __offsetof(struct mlx5_ib_congestion, arg[0])) / sizeof(u64))

/* All-ones value matching the bit width of a firmware structure field. */
#define	MLX5_IB_FLD_MAX(type, field)				\
	((1ULL << __mlx5_bit_sz(type, field)) - 1ULL)

/*
 * Write "var" into a firmware structure field, saturating it at the
 * largest value the field can hold.  "var" must be a modifiable lvalue
 * without side effects: it is evaluated several times and is
 * overwritten when clipping takes place.
 */
#define	MLX5_IB_SET_CLIPPED(type, ptr, field, var) do {		\
	if ((var) > MLX5_IB_FLD_MAX(type, field))		\
		(var) = MLX5_IB_FLD_MAX(type, field);		\
	MLX5_SET(type, ptr, field, var);			\
} while (0)
57 
/* Exclusive access to the per-device congestion state (an sx lock). */
#define	CONG_LOCK(dev)		sx_xlock(&(dev)->congestion.lock)
#define	CONG_UNLOCK(dev)	sx_xunlock(&(dev)->congestion.lock)
#define	CONG_LOCKED(dev)	sx_xlocked(&(dev)->congestion.lock)
61 
/*
 * Field-select bits accumulated by mlx5_ib_set_cc_param_mask_val() and
 * written into the field_select area of the modify command.
 */

/* Reaction point (RP) parameters. */
#define	MLX5_IB_RP_CLAMP_TGT_RATE_ATTR			BIT(1)
#define	MLX5_IB_RP_CLAMP_TGT_RATE_ATI_ATTR		BIT(2)
#define	MLX5_IB_RP_TIME_RESET_ATTR			BIT(3)
#define	MLX5_IB_RP_BYTE_RESET_ATTR			BIT(4)
#define	MLX5_IB_RP_THRESHOLD_ATTR			BIT(5)
#define	MLX5_IB_RP_AI_RATE_ATTR				BIT(7)
#define	MLX5_IB_RP_HAI_RATE_ATTR			BIT(8)
#define	MLX5_IB_RP_MIN_DEC_FAC_ATTR			BIT(9)
#define	MLX5_IB_RP_MIN_RATE_ATTR			BIT(10)
#define	MLX5_IB_RP_RATE_TO_SET_ON_FIRST_CNP_ATTR	BIT(11)
#define	MLX5_IB_RP_DCE_TCP_G_ATTR			BIT(12)
#define	MLX5_IB_RP_DCE_TCP_RTT_ATTR			BIT(13)
#define	MLX5_IB_RP_RATE_REDUCE_MONITOR_PERIOD_ATTR	BIT(14)
#define	MLX5_IB_RP_INITIAL_ALPHA_VALUE_ATTR		BIT(15)
#define	MLX5_IB_RP_GD_ATTR				BIT(16)

/* Notification point (NP) parameters. */
#define	MLX5_IB_NP_CNP_DSCP_ATTR			BIT(3)
#define	MLX5_IB_NP_CNP_PRIO_MODE_ATTR			BIT(4)
80 
/*
 * Congestion protocol selector.  The value is handed to the query
 * commands and stored in the cong_protocol field of modify commands.
 */
enum mlx5_ib_cong_node_type {
	MLX5_IB_RROCE_ECN_RP = 1,	/* reaction point */
	MLX5_IB_RROCE_ECN_NP = 2,	/* notification point */
};
85 
86 static enum mlx5_ib_cong_node_type
87 mlx5_ib_param_to_node(u32 index)
88 {
89 
90 	if (index >= MLX5_IB_INDEX(rp_clamp_tgt_rate) &&
91 	    index <= MLX5_IB_INDEX(rp_gd))
92 		return MLX5_IB_RROCE_ECN_RP;
93 	else
94 		return MLX5_IB_RROCE_ECN_NP;
95 }
96 
97 static u64
98 mlx5_get_cc_param_val(void *field, u32 index)
99 {
100 
101 	switch (index) {
102 	case MLX5_IB_INDEX(rp_clamp_tgt_rate):
103 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
104 				clamp_tgt_rate);
105 	case MLX5_IB_INDEX(rp_clamp_tgt_rate_ati):
106 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
107 				clamp_tgt_rate_after_time_inc);
108 	case MLX5_IB_INDEX(rp_time_reset):
109 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
110 				rpg_time_reset);
111 	case MLX5_IB_INDEX(rp_byte_reset):
112 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
113 				rpg_byte_reset);
114 	case MLX5_IB_INDEX(rp_threshold):
115 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
116 				rpg_threshold);
117 	case MLX5_IB_INDEX(rp_ai_rate):
118 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
119 				rpg_ai_rate);
120 	case MLX5_IB_INDEX(rp_hai_rate):
121 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
122 				rpg_hai_rate);
123 	case MLX5_IB_INDEX(rp_min_dec_fac):
124 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
125 				rpg_min_dec_fac);
126 	case MLX5_IB_INDEX(rp_min_rate):
127 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
128 				rpg_min_rate);
129 	case MLX5_IB_INDEX(rp_rate_to_set_on_first_cnp):
130 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
131 				rate_to_set_on_first_cnp);
132 	case MLX5_IB_INDEX(rp_dce_tcp_g):
133 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
134 				dce_tcp_g);
135 	case MLX5_IB_INDEX(rp_dce_tcp_rtt):
136 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
137 				dce_tcp_rtt);
138 	case MLX5_IB_INDEX(rp_rate_reduce_monitor_period):
139 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
140 				rate_reduce_monitor_period);
141 	case MLX5_IB_INDEX(rp_initial_alpha_value):
142 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
143 				initial_alpha_value);
144 	case MLX5_IB_INDEX(rp_gd):
145 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
146 				rpg_gd);
147 	case MLX5_IB_INDEX(np_cnp_dscp):
148 		return MLX5_GET(cong_control_r_roce_ecn_np, field,
149 				cnp_dscp);
150 	case MLX5_IB_INDEX(np_cnp_prio_mode):
151 		return MLX5_GET(cong_control_r_roce_ecn_np, field,
152 				cnp_prio_mode);
153 	case MLX5_IB_INDEX(np_cnp_prio):
154 		return MLX5_GET(cong_control_r_roce_ecn_np, field,
155 				cnp_802p_prio);
156 	default:
157 		return 0;
158 	}
159 }
160 
161 static void
162 mlx5_ib_set_cc_param_mask_val(void *field, u32 index,
163     u64 var, u32 *attr_mask)
164 {
165 
166 	switch (index) {
167 	case MLX5_IB_INDEX(rp_clamp_tgt_rate):
168 		*attr_mask |= MLX5_IB_RP_CLAMP_TGT_RATE_ATTR;
169 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
170 			 clamp_tgt_rate, var);
171 		break;
172 	case MLX5_IB_INDEX(rp_clamp_tgt_rate_ati):
173 		*attr_mask |= MLX5_IB_RP_CLAMP_TGT_RATE_ATI_ATTR;
174 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
175 			 clamp_tgt_rate_after_time_inc, var);
176 		break;
177 	case MLX5_IB_INDEX(rp_time_reset):
178 		*attr_mask |= MLX5_IB_RP_TIME_RESET_ATTR;
179 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
180 			 rpg_time_reset, var);
181 		break;
182 	case MLX5_IB_INDEX(rp_byte_reset):
183 		*attr_mask |= MLX5_IB_RP_BYTE_RESET_ATTR;
184 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
185 			 rpg_byte_reset, var);
186 		break;
187 	case MLX5_IB_INDEX(rp_threshold):
188 		*attr_mask |= MLX5_IB_RP_THRESHOLD_ATTR;
189 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
190 			 rpg_threshold, var);
191 		break;
192 	case MLX5_IB_INDEX(rp_ai_rate):
193 		*attr_mask |= MLX5_IB_RP_AI_RATE_ATTR;
194 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
195 			 rpg_ai_rate, var);
196 		break;
197 	case MLX5_IB_INDEX(rp_hai_rate):
198 		*attr_mask |= MLX5_IB_RP_HAI_RATE_ATTR;
199 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
200 			 rpg_hai_rate, var);
201 		break;
202 	case MLX5_IB_INDEX(rp_min_dec_fac):
203 		*attr_mask |= MLX5_IB_RP_MIN_DEC_FAC_ATTR;
204 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
205 			 rpg_min_dec_fac, var);
206 		break;
207 	case MLX5_IB_INDEX(rp_min_rate):
208 		*attr_mask |= MLX5_IB_RP_MIN_RATE_ATTR;
209 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
210 			 rpg_min_rate, var);
211 		break;
212 	case MLX5_IB_INDEX(rp_rate_to_set_on_first_cnp):
213 		*attr_mask |= MLX5_IB_RP_RATE_TO_SET_ON_FIRST_CNP_ATTR;
214 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
215 			 rate_to_set_on_first_cnp, var);
216 		break;
217 	case MLX5_IB_INDEX(rp_dce_tcp_g):
218 		*attr_mask |= MLX5_IB_RP_DCE_TCP_G_ATTR;
219 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
220 			 dce_tcp_g, var);
221 		break;
222 	case MLX5_IB_INDEX(rp_dce_tcp_rtt):
223 		*attr_mask |= MLX5_IB_RP_DCE_TCP_RTT_ATTR;
224 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
225 			 dce_tcp_rtt, var);
226 		break;
227 	case MLX5_IB_INDEX(rp_rate_reduce_monitor_period):
228 		*attr_mask |= MLX5_IB_RP_RATE_REDUCE_MONITOR_PERIOD_ATTR;
229 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
230 			 rate_reduce_monitor_period, var);
231 		break;
232 	case MLX5_IB_INDEX(rp_initial_alpha_value):
233 		*attr_mask |= MLX5_IB_RP_INITIAL_ALPHA_VALUE_ATTR;
234 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
235 			 initial_alpha_value, var);
236 		break;
237 	case MLX5_IB_INDEX(rp_gd):
238 		*attr_mask |= MLX5_IB_RP_GD_ATTR;
239 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
240 			 rpg_gd, var);
241 		break;
242 	case MLX5_IB_INDEX(np_cnp_dscp):
243 		*attr_mask |= MLX5_IB_NP_CNP_DSCP_ATTR;
244 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_np, field, cnp_dscp, var);
245 		break;
246 	case MLX5_IB_INDEX(np_cnp_prio_mode):
247 		*attr_mask |= MLX5_IB_NP_CNP_PRIO_MODE_ATTR;
248 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_np, field, cnp_prio_mode, var);
249 		break;
250 	case MLX5_IB_INDEX(np_cnp_prio):
251 		*attr_mask |= MLX5_IB_NP_CNP_PRIO_MODE_ATTR;
252 		MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_prio_mode, 0);
253 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_np, field, cnp_802p_prio, var);
254 		break;
255 	default:
256 		break;
257 	}
258 }
259 
260 static int
261 mlx5_ib_get_all_cc_params(struct mlx5_ib_dev *dev)
262 {
263 	int outlen = MLX5_ST_SZ_BYTES(query_cong_params_out);
264 	enum mlx5_ib_cong_node_type node = 0;
265 	void *out;
266 	void *field;
267 	u32 x;
268 	int err = 0;
269 
270 	out = kzalloc(outlen, GFP_KERNEL);
271 	if (!out)
272 		return -ENOMEM;
273 
274 	/* get the current values */
275 	for (x = 0; x != MLX5_IB_CONG_PARAMS_NUM; x++) {
276 		if (node != mlx5_ib_param_to_node(x)) {
277 			node = mlx5_ib_param_to_node(x);
278 
279 			err = mlx5_cmd_query_cong_params(dev->mdev, node, out, outlen);
280 			if (err)
281 				break;
282 		}
283 		field = MLX5_ADDR_OF(query_cong_params_out, out, congestion_parameters);
284 		dev->congestion.arg[x] = mlx5_get_cc_param_val(field, x);
285 	}
286 	kfree(out);
287 	return err;
288 }
289 
290 static int
291 mlx5_ib_set_cc_params(struct mlx5_ib_dev *dev, u32 index, u64 var)
292 {
293 	int inlen = MLX5_ST_SZ_BYTES(modify_cong_params_in);
294 	enum mlx5_ib_cong_node_type node;
295 	u32 attr_mask = 0;
296 	void *field;
297 	void *in;
298 	int err;
299 
300 	in = kzalloc(inlen, GFP_KERNEL);
301 	if (!in)
302 		return -ENOMEM;
303 
304 	MLX5_SET(modify_cong_params_in, in, opcode,
305 		 MLX5_CMD_OP_MODIFY_CONG_PARAMS);
306 
307 	node = mlx5_ib_param_to_node(index);
308 	MLX5_SET(modify_cong_params_in, in, cong_protocol, node);
309 
310 	field = MLX5_ADDR_OF(modify_cong_params_in, in, congestion_parameters);
311 	mlx5_ib_set_cc_param_mask_val(field, index, var, &attr_mask);
312 
313 	field = MLX5_ADDR_OF(modify_cong_params_in, in, field_select);
314 	MLX5_SET(field_select_r_roce_rp, field, field_select_r_roce_rp,
315 		 attr_mask);
316 
317 	err = mlx5_cmd_modify_cong_params(dev->mdev, in, inlen);
318 	kfree(in);
319 
320 	return err;
321 }
322 
323 static int
324 mlx5_ib_cong_params_handler(SYSCTL_HANDLER_ARGS)
325 {
326 	struct mlx5_ib_dev *dev = arg1;
327 	u64 value;
328 	int error;
329 
330 	CONG_LOCK(dev);
331 	value = dev->congestion.arg[arg2];
332 	if (req != NULL) {
333 		error = sysctl_handle_64(oidp, &value, 0, req);
334 		if (error || req->newptr == NULL ||
335 		    value == dev->congestion.arg[arg2])
336 			goto done;
337 
338 		/* assign new value */
339 		dev->congestion.arg[arg2] = value;
340 	} else {
341 		error = 0;
342 	}
343 	if (!MLX5_CAP_GEN(dev->mdev, cc_modify_allowed))
344 		error = EPERM;
345 	else {
346 		error = -mlx5_ib_set_cc_params(dev, MLX5_IB_INDEX(arg[arg2]),
347 		    dev->congestion.arg[arg2]);
348 	}
349 done:
350 	CONG_UNLOCK(dev);
351 
352 	return (error);
353 }
354 
355 static int
356 mlx5_ib_get_all_cc_status(struct mlx5_ib_dev *dev)
357 {
358 	const int outlen = MLX5_ST_SZ_BYTES(query_cong_status_out);
359 	uint32_t out[MLX5_ST_SZ_DW(query_cong_status_out)] = {};
360 	int error;
361 
362 #define	MLX5_IB_CONG_STATUS_READ(a,b,c,d,e,node,prio,field) do { \
363 	error = mlx5_cmd_query_cong_status(dev->mdev, node, prio, out, outlen); \
364 	if (error)							\
365 		goto done;						\
366 	dev->congestion.c = MLX5_GET(query_cong_status_out, out, field); \
367 } while (0);
368 
369 	MLX5_IB_CONG_STATUS(MLX5_IB_CONG_STATUS_READ);
370 done:
371 	return (error);
372 }
373 
374 static int
375 mlx5_ib_cong_status_handler(SYSCTL_HANDLER_ARGS)
376 {
377 	const int inlen = MLX5_ST_SZ_BYTES(modify_cong_status_in);
378 	uint32_t in[MLX5_ST_SZ_DW(modify_cong_status_in)] = {};
379 	struct mlx5_ib_dev *dev = arg1;
380 	u64 value;
381 	int error;
382 
383 	CONG_LOCK(dev);
384 	value = dev->congestion.arg[arg2];
385 	if (req != NULL) {
386 		error = sysctl_handle_64(oidp, &value, 0, req);
387 		/* convert value into a boolean */
388 		value = value ? 1 : 0;
389 		if (error || req->newptr == NULL ||
390 		    value == dev->congestion.arg[arg2])
391 			goto done;
392 
393 		/* assign new binary value */
394 		dev->congestion.arg[arg2] = value;
395 	} else {
396 		error = 0;
397 	}
398 	if (!MLX5_CAP_GEN(dev->mdev, cc_modify_allowed))
399 		error = EPERM;
400 	else switch (arg2) {
401 #define	MLX5_IB_CONG_STATUS_WRITE(a,b,c,d,e,node,prio,field)	\
402 	case MLX5_IB_INDEX(c):					\
403 		MLX5_SET(modify_cong_status_in, in, opcode,	\
404 		    MLX5_CMD_OP_MODIFY_CONG_STATUS);		\
405 		MLX5_SET(modify_cong_status_in, in, priority, prio); \
406 		MLX5_SET(modify_cong_status_in, in, cong_protocol, node); \
407 		MLX5_SET(modify_cong_status_in, in, field, value); \
408 		error = -mlx5_cmd_modify_cong_status(dev->mdev, in, inlen); \
409 		break;
410 	MLX5_IB_CONG_STATUS(MLX5_IB_CONG_STATUS_WRITE)
411 	default:
412 		error = EINVAL;
413 		break;
414 	}
415 done:
416 	CONG_UNLOCK(dev);
417 
418 	return (error);
419 }
420 
/*
 * Combine the two fields named <f>_high and <f>_low into one 64-bit
 * value, with the high part occupying the upper 32 bits.
 */
#define	MLX5_GET_UNALIGNED_64(t, p, f)				\
	(((u64)MLX5_GET(t, p, f##_high) << 32) |		\
	    MLX5_GET(t, p, f##_low))
423 
424 static void
425 mlx5_ib_read_cong_stats(struct work_struct *work)
426 {
427 	struct mlx5_ib_dev *dev =
428 	    container_of(work, struct mlx5_ib_dev, congestion.dwork.work);
429 	const int outlen = MLX5_ST_SZ_BYTES(query_cong_statistics_out);
430 	void *out;
431 
432 	out = kzalloc(outlen, GFP_KERNEL);
433 	if (!out)
434 		goto done;
435 
436 	CONG_LOCK(dev);
437 	if (mlx5_cmd_query_cong_counter(dev->mdev, 0, out, outlen))
438 		memset(out, 0, outlen);
439 
440 	dev->congestion.syndrome =
441 	    MLX5_GET(query_cong_statistics_out, out, syndrome);
442 	dev->congestion.rp_cur_flows =
443 	    MLX5_GET(query_cong_statistics_out, out, rp_cur_flows);
444 	dev->congestion.sum_flows =
445 	    MLX5_GET(query_cong_statistics_out, out, sum_flows);
446 	dev->congestion.rp_cnp_ignored =
447 	    MLX5_GET_UNALIGNED_64(query_cong_statistics_out, out, rp_cnp_ignored);
448 	dev->congestion.rp_cnp_handled =
449 	    MLX5_GET_UNALIGNED_64(query_cong_statistics_out, out, rp_cnp_handled);
450 	dev->congestion.time_stamp =
451 	    MLX5_GET_UNALIGNED_64(query_cong_statistics_out, out, time_stamp);
452 	dev->congestion.accumulators_period =
453 	    MLX5_GET(query_cong_statistics_out, out, accumulators_period);
454 	dev->congestion.np_ecn_marked_roce_packets =
455 	    MLX5_GET_UNALIGNED_64(query_cong_statistics_out, out, np_ecn_marked_roce_packets);
456 	dev->congestion.np_cnp_sent =
457 	    MLX5_GET_UNALIGNED_64(query_cong_statistics_out, out, np_cnp_sent);
458 
459 	CONG_UNLOCK(dev);
460 	kfree(out);
461 
462 done:
463 	schedule_delayed_work(&dev->congestion.dwork, hz);
464 }
465 
466 void
467 mlx5_ib_cleanup_congestion(struct mlx5_ib_dev *dev)
468 {
469 
470 	while (cancel_delayed_work_sync(&dev->congestion.dwork))
471 		;
472 	sysctl_ctx_free(&dev->congestion.ctx);
473 	sx_destroy(&dev->congestion.lock);
474 }
475 
476 int
477 mlx5_ib_init_congestion(struct mlx5_ib_dev *dev)
478 {
479 	struct sysctl_ctx_list *ctx;
480 	struct sysctl_oid *parent;
481 	struct sysctl_oid *node;
482 	int err;
483 	u32 x;
484 
485 	ctx = &dev->congestion.ctx;
486 	sysctl_ctx_init(ctx);
487 	sx_init(&dev->congestion.lock, "mlx5ibcong");
488 	INIT_DELAYED_WORK(&dev->congestion.dwork, mlx5_ib_read_cong_stats);
489 
490 	if (!MLX5_CAP_GEN(dev->mdev, cc_query_allowed))
491 		return (0);
492 
493 	err = mlx5_ib_get_all_cc_params(dev);
494 	if (err)
495 		return (err);
496 
497 	err = mlx5_ib_get_all_cc_status(dev);
498 	if (err)
499 		return (err);
500 
501 	parent = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(dev->ib_dev.dev.kobj.oidp),
502 	    OID_AUTO, "cong", CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
503 	    "Congestion control");
504 	if (parent == NULL)
505 		return (-ENOMEM);
506 
507 	node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(parent),
508 	    OID_AUTO, "conf", CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
509 	    "Configuration");
510 	if (node == NULL) {
511 		sysctl_ctx_free(&dev->congestion.ctx);
512 		return (-ENOMEM);
513 	}
514 
515 	for (x = 0; x != MLX5_IB_CONG_PARAMS_NUM; x++) {
516 		SYSCTL_ADD_PROC(ctx,
517 		    SYSCTL_CHILDREN(node), OID_AUTO,
518 		    mlx5_ib_cong_params_desc[2 * x],
519 		    CTLTYPE_U64 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
520 		    dev, x, &mlx5_ib_cong_params_handler, "QU",
521 		    mlx5_ib_cong_params_desc[2 * x + 1]);
522 	}
523 
524 	for (x = 0; x != MLX5_IB_CONG_STATUS_NUM; x++) {
525 		SYSCTL_ADD_PROC(ctx,
526 		    SYSCTL_CHILDREN(node), OID_AUTO,
527 		    mlx5_ib_cong_status_desc[2 * x],
528 		    CTLTYPE_U64 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
529 		    dev, x + MLX5_IB_CONG_PARAMS_NUM + MLX5_IB_CONG_STATS_NUM,
530 		    &mlx5_ib_cong_status_handler, "QU",
531 		    mlx5_ib_cong_status_desc[2 * x + 1]);
532 	}
533 
534 	node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(parent),
535 	    OID_AUTO, "stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
536 	    "Statistics");
537 	if (node == NULL) {
538 		sysctl_ctx_free(&dev->congestion.ctx);
539 		return (-ENOMEM);
540 	}
541 
542 	for (x = 0; x != MLX5_IB_CONG_STATS_NUM; x++) {
543 		/* read-only SYSCTLs */
544 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(node), OID_AUTO,
545 		    mlx5_ib_cong_stats_desc[2 * x],
546 		    CTLFLAG_RD | CTLFLAG_MPSAFE,
547 		    &dev->congestion.arg[x + MLX5_IB_CONG_PARAMS_NUM],
548 		    0, mlx5_ib_cong_stats_desc[2 * x + 1]);
549 	}
550 	schedule_delayed_work(&dev->congestion.dwork, hz);
551 	return (0);
552 }
553