xref: /freebsd/sys/dev/mlx5/mlx5_ib/mlx5_ib_cong.c (revision 0957b409a90fd597c1e9124cbaf3edd2b488f4ac)
1 /*-
2  * Copyright (c) 2013-2015, Mellanox Technologies, Ltd.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  * $FreeBSD$
26  */
27 
28 #include "mlx5_ib.h"
29 
30 #include <dev/mlx5/cmd.h>
31 
/*
 * Sysctl name/description strings for the tunable congestion parameters.
 * The table is produced by expanding MLX5_IB_CONG_PARAMS() with
 * MLX5_IB_STATS_DESC (both are defined outside this file).
 * mlx5_ib_init_congestion() uses entry [2 * x] as the sysctl name and
 * entry [2 * x + 1] as the description of parameter number x.
 */
static const char *mlx5_ib_cong_params_desc[] = {
	MLX5_IB_CONG_PARAMS(MLX5_IB_STATS_DESC)
};
35 
/*
 * Sysctl name/description strings for the read-only congestion statistics.
 * The table is produced by expanding MLX5_IB_CONG_STATS() with
 * MLX5_IB_STATS_DESC (both are defined outside this file).
 * mlx5_ib_init_congestion() uses entry [2 * x] as the sysctl name and
 * entry [2 * x + 1] as the description of statistic number x.
 */
static const char *mlx5_ib_cong_stats_desc[] = {
	MLX5_IB_CONG_STATS(MLX5_IB_STATS_DESC)
};
39 
/*
 * Zero-based slot of "field" inside the congestion.arg[] array, counted in
 * u64-sized elements.
 *
 * The offset is taken relative to arg[0] rather than to the start of
 * struct mlx5_ib_congestion, so that MLX5_IB_INDEX(arg[n]) == n and a named
 * member yields the same number that is used to index congestion.arg[],
 * no matter which members precede arg[] in the structure.
 */
#define	MLX5_IB_INDEX(field) (					\
    (__offsetof(struct mlx5_ib_congestion, field) -		\
     __offsetof(struct mlx5_ib_congestion, arg[0])) / sizeof(u64))

/*
 * All-ones value that is __mlx5_bit_sz(type, field) bits wide.
 * Presumably the largest value the firmware field can hold; this assumes
 * __mlx5_bit_sz() reports the field width in bits - TODO confirm.
 */
#define	MLX5_IB_FLD_MAX(type, field) ((1ULL << __mlx5_bit_sz(type, field)) - 1ULL)

/*
 * Store "var" into "field" of the "type" layout at "ptr", clamping it to
 * MLX5_IB_FLD_MAX() first.
 *
 * "var" must be a modifiable lvalue: it is overwritten with the clamped
 * value and is evaluated more than once.
 */
#define	MLX5_IB_SET_CLIPPED(type, ptr, field, var) do { \
  /* rangecheck */					\
  if ((var) > MLX5_IB_FLD_MAX(type, field))		\
	(var) = MLX5_IB_FLD_MAX(type, field);		\
  /* set value */					\
  MLX5_SET(type, ptr, field, var);			\
} while (0)
49 
/*
 * Helpers for the exclusive sx lock stored in dev->congestion.lock.
 * In this file the lock is held while the sysctl handler updates a cached
 * parameter and while the periodic worker refreshes the statistics.
 * CONG_LOCKED() is not referenced in the visible part of this file.
 */
#define	CONG_LOCK(dev) sx_xlock(&(dev)->congestion.lock)
#define	CONG_UNLOCK(dev) sx_xunlock(&(dev)->congestion.lock)
#define	CONG_LOCKED(dev) sx_xlocked(&(dev)->congestion.lock)
53 
/*
 * Attribute bits for the "RP" parameters.  mlx5_ib_set_cc_param_mask_val()
 * ORs exactly one of these into the attribute mask for the parameter being
 * changed, and mlx5_ib_set_cc_params() writes that mask into the
 * field_select part of the modify_cong_params_in command.
 * Bit positions that are not listed here are not used by this file.
 */
#define	MLX5_IB_RP_CLAMP_TGT_RATE_ATTR			BIT(1)
#define	MLX5_IB_RP_CLAMP_TGT_RATE_ATI_ATTR		BIT(2)
#define	MLX5_IB_RP_TIME_RESET_ATTR			BIT(3)
#define	MLX5_IB_RP_BYTE_RESET_ATTR			BIT(4)
#define	MLX5_IB_RP_THRESHOLD_ATTR			BIT(5)
#define	MLX5_IB_RP_AI_RATE_ATTR				BIT(7)
#define	MLX5_IB_RP_HAI_RATE_ATTR			BIT(8)
#define	MLX5_IB_RP_MIN_DEC_FAC_ATTR			BIT(9)
#define	MLX5_IB_RP_MIN_RATE_ATTR			BIT(10)
#define	MLX5_IB_RP_RATE_TO_SET_ON_FIRST_CNP_ATTR	BIT(11)
#define	MLX5_IB_RP_DCE_TCP_G_ATTR			BIT(12)
#define	MLX5_IB_RP_DCE_TCP_RTT_ATTR			BIT(13)
#define	MLX5_IB_RP_RATE_REDUCE_MONITOR_PERIOD_ATTR	BIT(14)
#define	MLX5_IB_RP_INITIAL_ALPHA_VALUE_ATTR		BIT(15)
#define	MLX5_IB_RP_GD_ATTR				BIT(16)

/*
 * Attribute bits for the "NP" parameters, used the same way as the RP bits.
 * The values overlap with the RP bits above; presumably the firmware
 * interprets the mask according to the cong_protocol of the command -
 * TODO confirm against the device documentation.
 */
#define	MLX5_IB_NP_CNP_DSCP_ATTR			BIT(3)
#define	MLX5_IB_NP_CNP_PRIO_MODE_ATTR			BIT(4)
72 
/*
 * Parameter group a congestion parameter belongs to.  The value is passed
 * to mlx5_cmd_query_cong_params() when reading parameters and stored in the
 * cong_protocol field of modify_cong_params_in when writing them.
 * Zero is not a valid group; mlx5_ib_get_all_cc_params() relies on that to
 * force its first query.
 */
enum mlx5_ib_cong_node_type {
	MLX5_IB_RROCE_ECN_RP = 1,
	MLX5_IB_RROCE_ECN_NP = 2,
};
77 
78 static enum mlx5_ib_cong_node_type
79 mlx5_ib_param_to_node(u32 index)
80 {
81 
82 	if (index >= MLX5_IB_INDEX(rp_clamp_tgt_rate) &&
83 	    index <= MLX5_IB_INDEX(rp_gd))
84 		return MLX5_IB_RROCE_ECN_RP;
85 	else
86 		return MLX5_IB_RROCE_ECN_NP;
87 }
88 
89 static u64
90 mlx5_get_cc_param_val(void *field, u32 index)
91 {
92 
93 	switch (index) {
94 	case MLX5_IB_INDEX(rp_clamp_tgt_rate):
95 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
96 				clamp_tgt_rate);
97 	case MLX5_IB_INDEX(rp_clamp_tgt_rate_ati):
98 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
99 				clamp_tgt_rate_after_time_inc);
100 	case MLX5_IB_INDEX(rp_time_reset):
101 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
102 				rpg_time_reset);
103 	case MLX5_IB_INDEX(rp_byte_reset):
104 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
105 				rpg_byte_reset);
106 	case MLX5_IB_INDEX(rp_threshold):
107 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
108 				rpg_threshold);
109 	case MLX5_IB_INDEX(rp_ai_rate):
110 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
111 				rpg_ai_rate);
112 	case MLX5_IB_INDEX(rp_hai_rate):
113 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
114 				rpg_hai_rate);
115 	case MLX5_IB_INDEX(rp_min_dec_fac):
116 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
117 				rpg_min_dec_fac);
118 	case MLX5_IB_INDEX(rp_min_rate):
119 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
120 				rpg_min_rate);
121 	case MLX5_IB_INDEX(rp_rate_to_set_on_first_cnp):
122 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
123 				rate_to_set_on_first_cnp);
124 	case MLX5_IB_INDEX(rp_dce_tcp_g):
125 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
126 				dce_tcp_g);
127 	case MLX5_IB_INDEX(rp_dce_tcp_rtt):
128 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
129 				dce_tcp_rtt);
130 	case MLX5_IB_INDEX(rp_rate_reduce_monitor_period):
131 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
132 				rate_reduce_monitor_period);
133 	case MLX5_IB_INDEX(rp_initial_alpha_value):
134 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
135 				initial_alpha_value);
136 	case MLX5_IB_INDEX(rp_gd):
137 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
138 				rpg_gd);
139 	case MLX5_IB_INDEX(np_cnp_dscp):
140 		return MLX5_GET(cong_control_r_roce_ecn_np, field,
141 				cnp_dscp);
142 	case MLX5_IB_INDEX(np_cnp_prio_mode):
143 		return MLX5_GET(cong_control_r_roce_ecn_np, field,
144 				cnp_prio_mode);
145 	case MLX5_IB_INDEX(np_cnp_prio):
146 		return MLX5_GET(cong_control_r_roce_ecn_np, field,
147 				cnp_802p_prio);
148 	default:
149 		return 0;
150 	}
151 }
152 
153 static void
154 mlx5_ib_set_cc_param_mask_val(void *field, u32 index,
155     u64 var, u32 *attr_mask)
156 {
157 
158 	switch (index) {
159 	case MLX5_IB_INDEX(rp_clamp_tgt_rate):
160 		*attr_mask |= MLX5_IB_RP_CLAMP_TGT_RATE_ATTR;
161 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
162 			 clamp_tgt_rate, var);
163 		break;
164 	case MLX5_IB_INDEX(rp_clamp_tgt_rate_ati):
165 		*attr_mask |= MLX5_IB_RP_CLAMP_TGT_RATE_ATI_ATTR;
166 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
167 			 clamp_tgt_rate_after_time_inc, var);
168 		break;
169 	case MLX5_IB_INDEX(rp_time_reset):
170 		*attr_mask |= MLX5_IB_RP_TIME_RESET_ATTR;
171 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
172 			 rpg_time_reset, var);
173 		break;
174 	case MLX5_IB_INDEX(rp_byte_reset):
175 		*attr_mask |= MLX5_IB_RP_BYTE_RESET_ATTR;
176 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
177 			 rpg_byte_reset, var);
178 		break;
179 	case MLX5_IB_INDEX(rp_threshold):
180 		*attr_mask |= MLX5_IB_RP_THRESHOLD_ATTR;
181 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
182 			 rpg_threshold, var);
183 		break;
184 	case MLX5_IB_INDEX(rp_ai_rate):
185 		*attr_mask |= MLX5_IB_RP_AI_RATE_ATTR;
186 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
187 			 rpg_ai_rate, var);
188 		break;
189 	case MLX5_IB_INDEX(rp_hai_rate):
190 		*attr_mask |= MLX5_IB_RP_HAI_RATE_ATTR;
191 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
192 			 rpg_hai_rate, var);
193 		break;
194 	case MLX5_IB_INDEX(rp_min_dec_fac):
195 		*attr_mask |= MLX5_IB_RP_MIN_DEC_FAC_ATTR;
196 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
197 			 rpg_min_dec_fac, var);
198 		break;
199 	case MLX5_IB_INDEX(rp_min_rate):
200 		*attr_mask |= MLX5_IB_RP_MIN_RATE_ATTR;
201 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
202 			 rpg_min_rate, var);
203 		break;
204 	case MLX5_IB_INDEX(rp_rate_to_set_on_first_cnp):
205 		*attr_mask |= MLX5_IB_RP_RATE_TO_SET_ON_FIRST_CNP_ATTR;
206 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
207 			 rate_to_set_on_first_cnp, var);
208 		break;
209 	case MLX5_IB_INDEX(rp_dce_tcp_g):
210 		*attr_mask |= MLX5_IB_RP_DCE_TCP_G_ATTR;
211 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
212 			 dce_tcp_g, var);
213 		break;
214 	case MLX5_IB_INDEX(rp_dce_tcp_rtt):
215 		*attr_mask |= MLX5_IB_RP_DCE_TCP_RTT_ATTR;
216 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
217 			 dce_tcp_rtt, var);
218 		break;
219 	case MLX5_IB_INDEX(rp_rate_reduce_monitor_period):
220 		*attr_mask |= MLX5_IB_RP_RATE_REDUCE_MONITOR_PERIOD_ATTR;
221 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
222 			 rate_reduce_monitor_period, var);
223 		break;
224 	case MLX5_IB_INDEX(rp_initial_alpha_value):
225 		*attr_mask |= MLX5_IB_RP_INITIAL_ALPHA_VALUE_ATTR;
226 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
227 			 initial_alpha_value, var);
228 		break;
229 	case MLX5_IB_INDEX(rp_gd):
230 		*attr_mask |= MLX5_IB_RP_GD_ATTR;
231 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
232 			 rpg_gd, var);
233 		break;
234 	case MLX5_IB_INDEX(np_cnp_dscp):
235 		*attr_mask |= MLX5_IB_NP_CNP_DSCP_ATTR;
236 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_np, field, cnp_dscp, var);
237 		break;
238 	case MLX5_IB_INDEX(np_cnp_prio_mode):
239 		*attr_mask |= MLX5_IB_NP_CNP_PRIO_MODE_ATTR;
240 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_np, field, cnp_prio_mode, var);
241 		break;
242 	case MLX5_IB_INDEX(np_cnp_prio):
243 		*attr_mask |= MLX5_IB_NP_CNP_PRIO_MODE_ATTR;
244 		MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_prio_mode, 0);
245 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_np, field, cnp_802p_prio, var);
246 		break;
247 	default:
248 		break;
249 	}
250 }
251 
252 static int
253 mlx5_ib_get_all_cc_params(struct mlx5_ib_dev *dev)
254 {
255 	int outlen = MLX5_ST_SZ_BYTES(query_cong_params_out);
256 	enum mlx5_ib_cong_node_type node = 0;
257 	void *out;
258 	void *field;
259 	u32 x;
260 	int err = 0;
261 
262 	out = kzalloc(outlen, GFP_KERNEL);
263 	if (!out)
264 		return -ENOMEM;
265 
266 	/* get the current values */
267 	for (x = 0; x != MLX5_IB_CONG_PARAMS_NUM; x++) {
268 		if (node != mlx5_ib_param_to_node(x)) {
269 			node = mlx5_ib_param_to_node(x);
270 
271 			err = mlx5_cmd_query_cong_params(dev->mdev, node, out, outlen);
272 			if (err)
273 				break;
274 		}
275 		field = MLX5_ADDR_OF(query_cong_params_out, out, congestion_parameters);
276 		dev->congestion.arg[x] = mlx5_get_cc_param_val(field, x);
277 	}
278 	kfree(out);
279 	return err;
280 }
281 
282 static int
283 mlx5_ib_set_cc_params(struct mlx5_ib_dev *dev, u32 index, u64 var)
284 {
285 	int inlen = MLX5_ST_SZ_BYTES(modify_cong_params_in);
286 	enum mlx5_ib_cong_node_type node;
287 	u32 attr_mask = 0;
288 	void *field;
289 	void *in;
290 	int err;
291 
292 	in = kzalloc(inlen, GFP_KERNEL);
293 	if (!in)
294 		return -ENOMEM;
295 
296 	MLX5_SET(modify_cong_params_in, in, opcode,
297 		 MLX5_CMD_OP_MODIFY_CONG_PARAMS);
298 
299 	node = mlx5_ib_param_to_node(index);
300 	MLX5_SET(modify_cong_params_in, in, cong_protocol, node);
301 
302 	field = MLX5_ADDR_OF(modify_cong_params_in, in, congestion_parameters);
303 	mlx5_ib_set_cc_param_mask_val(field, index, var, &attr_mask);
304 
305 	field = MLX5_ADDR_OF(modify_cong_params_in, in, field_select);
306 	MLX5_SET(field_select_r_roce_rp, field, field_select_r_roce_rp,
307 		 attr_mask);
308 
309 	err = mlx5_cmd_modify_cong_params(dev->mdev, in, inlen);
310 	kfree(in);
311 
312 	return err;
313 }
314 
315 static int
316 mlx5_ib_cong_params_handler(SYSCTL_HANDLER_ARGS)
317 {
318 	struct mlx5_ib_dev *dev = arg1;
319 	u64 value;
320 	int error;
321 
322 	CONG_LOCK(dev);
323 	value = dev->congestion.arg[arg2];
324 	if (req != NULL) {
325 		error = sysctl_handle_64(oidp, &value, 0, req);
326 		if (error || req->newptr == NULL ||
327 		    value == dev->congestion.arg[arg2])
328 			goto done;
329 
330 		/* assign new value */
331 		dev->congestion.arg[arg2] = value;
332 	} else {
333 		error = 0;
334 	}
335 	if (!MLX5_CAP_GEN(dev->mdev, cc_modify_allowed))
336 		error = EPERM;
337 	else {
338 		error = -mlx5_ib_set_cc_params(dev, MLX5_IB_INDEX(arg[arg2]),
339 		    dev->congestion.arg[arg2]);
340 	}
341 done:
342 	CONG_UNLOCK(dev);
343 
344 	return (error);
345 }
346 
/*
 * Read a 64-bit quantity that the layout "t" stores as two separate
 * fields named f_high and f_low: f_high supplies bits 63..32 of the
 * result and f_low is ORed into the lower part.
 */
#define	MLX5_GET_UNALIGNED_64(t,p,f) \
    (((u64)MLX5_GET(t,p,f##_high) << 32) | MLX5_GET(t,p,f##_low))
349 
350 static void
351 mlx5_ib_read_cong_stats(struct work_struct *work)
352 {
353 	struct mlx5_ib_dev *dev =
354 	    container_of(work, struct mlx5_ib_dev, congestion.dwork.work);
355 	const int outlen = MLX5_ST_SZ_BYTES(query_cong_statistics_out);
356 	void *out;
357 
358 	out = kzalloc(outlen, GFP_KERNEL);
359 	if (!out)
360 		goto done;
361 
362 	CONG_LOCK(dev);
363 	if (mlx5_cmd_query_cong_counter(dev->mdev, 0, out, outlen))
364 		memset(out, 0, outlen);
365 
366 	dev->congestion.syndrome =
367 	    MLX5_GET(query_cong_statistics_out, out, syndrome);
368 	dev->congestion.rp_cur_flows =
369 	    MLX5_GET(query_cong_statistics_out, out, rp_cur_flows);
370 	dev->congestion.sum_flows =
371 	    MLX5_GET(query_cong_statistics_out, out, sum_flows);
372 	dev->congestion.rp_cnp_ignored =
373 	    MLX5_GET_UNALIGNED_64(query_cong_statistics_out, out, rp_cnp_ignored);
374 	dev->congestion.rp_cnp_handled =
375 	    MLX5_GET_UNALIGNED_64(query_cong_statistics_out, out, rp_cnp_handled);
376 	dev->congestion.time_stamp =
377 	    MLX5_GET_UNALIGNED_64(query_cong_statistics_out, out, time_stamp);
378 	dev->congestion.accumulators_period =
379 	    MLX5_GET(query_cong_statistics_out, out, accumulators_period);
380 	dev->congestion.np_ecn_marked_roce_packets =
381 	    MLX5_GET_UNALIGNED_64(query_cong_statistics_out, out, np_ecn_marked_roce_packets);
382 	dev->congestion.np_cnp_sent =
383 	    MLX5_GET_UNALIGNED_64(query_cong_statistics_out, out, np_cnp_sent);
384 
385 	CONG_UNLOCK(dev);
386 	kfree(out);
387 
388 done:
389 	schedule_delayed_work(&dev->congestion.dwork, hz);
390 }
391 
392 void
393 mlx5_ib_cleanup_congestion(struct mlx5_ib_dev *dev)
394 {
395 
396 	while (cancel_delayed_work_sync(&dev->congestion.dwork))
397 		;
398 	sysctl_ctx_free(&dev->congestion.ctx);
399 	sx_destroy(&dev->congestion.lock);
400 }
401 
402 int
403 mlx5_ib_init_congestion(struct mlx5_ib_dev *dev)
404 {
405 	struct sysctl_ctx_list *ctx;
406 	struct sysctl_oid *parent;
407 	struct sysctl_oid *node;
408 	int err;
409 	u32 x;
410 
411 	ctx = &dev->congestion.ctx;
412 	sysctl_ctx_init(ctx);
413 	sx_init(&dev->congestion.lock, "mlx5ibcong");
414 	INIT_DELAYED_WORK(&dev->congestion.dwork, mlx5_ib_read_cong_stats);
415 
416 	if (!MLX5_CAP_GEN(dev->mdev, cc_query_allowed))
417 		return (0);
418 
419 	err = mlx5_ib_get_all_cc_params(dev);
420 	if (err)
421 		return (err);
422 
423 	parent = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(dev->ib_dev.dev.kobj.oidp),
424 	    OID_AUTO, "cong", CTLFLAG_RW, NULL, "Congestion control");
425 	if (parent == NULL)
426 		return (-ENOMEM);
427 
428 	node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(parent),
429 	    OID_AUTO, "conf", CTLFLAG_RW, NULL, "Configuration");
430 	if (node == NULL) {
431 		sysctl_ctx_free(&dev->congestion.ctx);
432 		return (-ENOMEM);
433 	}
434 
435 	for (x = 0; x != MLX5_IB_CONG_PARAMS_NUM; x++) {
436 		SYSCTL_ADD_PROC(ctx,
437 		    SYSCTL_CHILDREN(node), OID_AUTO,
438 		    mlx5_ib_cong_params_desc[2 * x],
439 		    CTLTYPE_U64 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
440 		    dev, x, &mlx5_ib_cong_params_handler, "QU",
441 		    mlx5_ib_cong_params_desc[2 * x + 1]);
442 	}
443 
444 	node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(parent),
445 	    OID_AUTO, "stats", CTLFLAG_RD, NULL, "Statistics");
446 	if (node == NULL) {
447 		sysctl_ctx_free(&dev->congestion.ctx);
448 		return (-ENOMEM);
449 	}
450 
451 	for (x = 0; x != MLX5_IB_CONG_STATS_NUM; x++) {
452 		/* read-only SYSCTLs */
453 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(node), OID_AUTO,
454 		    mlx5_ib_cong_stats_desc[2 * x],
455 		    CTLFLAG_RD | CTLFLAG_MPSAFE,
456 		    &dev->congestion.arg[x + MLX5_IB_CONG_PARAMS_NUM],
457 		    0, mlx5_ib_cong_stats_desc[2 * x + 1]);
458 	}
459 	schedule_delayed_work(&dev->congestion.dwork, hz);
460 	return (0);
461 }
462