xref: /freebsd/sys/dev/mlx5/mlx5_ib/mlx5_ib_cong.c (revision b4af4f93c682e445bf159f0d1ec90b636296c946)
1 /*-
2  * Copyright (c) 2013-2020, Mellanox Technologies, Ltd.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  * $FreeBSD$
26  */
27 
28 #include "mlx5_ib.h"
29 
30 #include <dev/mlx5/cmd.h>
31 
32 static const char *mlx5_ib_cong_params_desc[] = {
33 	MLX5_IB_CONG_PARAMS(MLX5_IB_STATS_DESC)
34 };
35 
36 static const char *mlx5_ib_cong_stats_desc[] = {
37 	MLX5_IB_CONG_STATS(MLX5_IB_STATS_DESC)
38 };
39 
/*
 * MLX5_IB_INDEX(field) - slot number of "field" inside the arg[] array of
 * struct mlx5_ib_congestion, computed as the byte distance from arg[0]
 * divided by the size of one u64 slot.
 */
#define	MLX5_IB_INDEX(field)						\
	((__offsetof(struct mlx5_ib_congestion, field) -		\
	  __offsetof(struct mlx5_ib_congestion, arg[0])) / sizeof(u64))

/*
 * MLX5_IB_FLD_MAX(type, field) - all-ones value for a field whose width in
 * bits is given by __mlx5_bit_sz(type, field).
 */
#define	MLX5_IB_FLD_MAX(type, field)					\
	((1ULL << __mlx5_bit_sz(type, field)) - 1ULL)

/*
 * MLX5_IB_SET_CLIPPED(type, ptr, field, var) - store "var" into "field".
 *
 * NOTE: "var" must be a modifiable lvalue. It is evaluated more than once
 * and is overwritten with the field maximum when it is out of range.
 */
#define	MLX5_IB_SET_CLIPPED(type, ptr, field, var) do {			\
	/* clamp to the largest value the field can hold */		\
	if ((var) > MLX5_IB_FLD_MAX(type, field))			\
		(var) = MLX5_IB_FLD_MAX(type, field);			\
	/* write the (possibly clamped) value */			\
	MLX5_SET(type, ptr, field, var);				\
} while (0)

/* Exclusive sx(9) lock protecting the per-device congestion state. */
#define	CONG_LOCK(dev)		sx_xlock(&(dev)->congestion.lock)
#define	CONG_UNLOCK(dev)	sx_xunlock(&(dev)->congestion.lock)
#define	CONG_LOCKED(dev)	sx_xlocked(&(dev)->congestion.lock)
55 
/*
 * Field-select bits for the reaction point (RP) parameters.
 *
 * mlx5_ib_set_cc_param_mask_val() ORs one of these into the attribute mask
 * that mlx5_ib_set_cc_params() writes into the field_select area of the
 * MODIFY_CONG_PARAMS command. Bits 0 and 6 have no definition here.
 */
#define	MLX5_IB_RP_CLAMP_TGT_RATE_ATTR			BIT(1)
#define	MLX5_IB_RP_CLAMP_TGT_RATE_ATI_ATTR		BIT(2)
#define	MLX5_IB_RP_TIME_RESET_ATTR			BIT(3)
#define	MLX5_IB_RP_BYTE_RESET_ATTR			BIT(4)
#define	MLX5_IB_RP_THRESHOLD_ATTR			BIT(5)
#define	MLX5_IB_RP_AI_RATE_ATTR				BIT(7)
#define	MLX5_IB_RP_HAI_RATE_ATTR			BIT(8)
#define	MLX5_IB_RP_MIN_DEC_FAC_ATTR			BIT(9)
#define	MLX5_IB_RP_MIN_RATE_ATTR			BIT(10)
#define	MLX5_IB_RP_RATE_TO_SET_ON_FIRST_CNP_ATTR	BIT(11)
#define	MLX5_IB_RP_DCE_TCP_G_ATTR			BIT(12)
#define	MLX5_IB_RP_DCE_TCP_RTT_ATTR			BIT(13)
#define	MLX5_IB_RP_RATE_REDUCE_MONITOR_PERIOD_ATTR	BIT(14)
#define	MLX5_IB_RP_INITIAL_ALPHA_VALUE_ATTR		BIT(15)
#define	MLX5_IB_RP_GD_ATTR				BIT(16)

/*
 * Field-select bits for the notification point (NP) parameters. These
 * reuse bit positions 3 and 4; the protocol written into the command
 * (see enum mlx5_ib_cong_node_type) tells the two sets apart.
 */
#define	MLX5_IB_NP_CNP_DSCP_ATTR			BIT(3)
#define	MLX5_IB_NP_CNP_PRIO_MODE_ATTR			BIT(4)

/*
 * Congestion protocol selector, used as the cong_protocol value of the
 * modify command and as the node argument of the query command.
 */
enum mlx5_ib_cong_node_type {
	MLX5_IB_RROCE_ECN_RP = 1,	/* rp_* parameters */
	MLX5_IB_RROCE_ECN_NP = 2,	/* np_* parameters */
};
79 
80 static enum mlx5_ib_cong_node_type
81 mlx5_ib_param_to_node(u32 index)
82 {
83 
84 	if (index >= MLX5_IB_INDEX(rp_clamp_tgt_rate) &&
85 	    index <= MLX5_IB_INDEX(rp_gd))
86 		return MLX5_IB_RROCE_ECN_RP;
87 	else
88 		return MLX5_IB_RROCE_ECN_NP;
89 }
90 
91 static u64
92 mlx5_get_cc_param_val(void *field, u32 index)
93 {
94 
95 	switch (index) {
96 	case MLX5_IB_INDEX(rp_clamp_tgt_rate):
97 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
98 				clamp_tgt_rate);
99 	case MLX5_IB_INDEX(rp_clamp_tgt_rate_ati):
100 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
101 				clamp_tgt_rate_after_time_inc);
102 	case MLX5_IB_INDEX(rp_time_reset):
103 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
104 				rpg_time_reset);
105 	case MLX5_IB_INDEX(rp_byte_reset):
106 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
107 				rpg_byte_reset);
108 	case MLX5_IB_INDEX(rp_threshold):
109 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
110 				rpg_threshold);
111 	case MLX5_IB_INDEX(rp_ai_rate):
112 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
113 				rpg_ai_rate);
114 	case MLX5_IB_INDEX(rp_hai_rate):
115 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
116 				rpg_hai_rate);
117 	case MLX5_IB_INDEX(rp_min_dec_fac):
118 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
119 				rpg_min_dec_fac);
120 	case MLX5_IB_INDEX(rp_min_rate):
121 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
122 				rpg_min_rate);
123 	case MLX5_IB_INDEX(rp_rate_to_set_on_first_cnp):
124 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
125 				rate_to_set_on_first_cnp);
126 	case MLX5_IB_INDEX(rp_dce_tcp_g):
127 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
128 				dce_tcp_g);
129 	case MLX5_IB_INDEX(rp_dce_tcp_rtt):
130 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
131 				dce_tcp_rtt);
132 	case MLX5_IB_INDEX(rp_rate_reduce_monitor_period):
133 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
134 				rate_reduce_monitor_period);
135 	case MLX5_IB_INDEX(rp_initial_alpha_value):
136 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
137 				initial_alpha_value);
138 	case MLX5_IB_INDEX(rp_gd):
139 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
140 				rpg_gd);
141 	case MLX5_IB_INDEX(np_cnp_dscp):
142 		return MLX5_GET(cong_control_r_roce_ecn_np, field,
143 				cnp_dscp);
144 	case MLX5_IB_INDEX(np_cnp_prio_mode):
145 		return MLX5_GET(cong_control_r_roce_ecn_np, field,
146 				cnp_prio_mode);
147 	case MLX5_IB_INDEX(np_cnp_prio):
148 		return MLX5_GET(cong_control_r_roce_ecn_np, field,
149 				cnp_802p_prio);
150 	default:
151 		return 0;
152 	}
153 }
154 
155 static void
156 mlx5_ib_set_cc_param_mask_val(void *field, u32 index,
157     u64 var, u32 *attr_mask)
158 {
159 
160 	switch (index) {
161 	case MLX5_IB_INDEX(rp_clamp_tgt_rate):
162 		*attr_mask |= MLX5_IB_RP_CLAMP_TGT_RATE_ATTR;
163 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
164 			 clamp_tgt_rate, var);
165 		break;
166 	case MLX5_IB_INDEX(rp_clamp_tgt_rate_ati):
167 		*attr_mask |= MLX5_IB_RP_CLAMP_TGT_RATE_ATI_ATTR;
168 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
169 			 clamp_tgt_rate_after_time_inc, var);
170 		break;
171 	case MLX5_IB_INDEX(rp_time_reset):
172 		*attr_mask |= MLX5_IB_RP_TIME_RESET_ATTR;
173 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
174 			 rpg_time_reset, var);
175 		break;
176 	case MLX5_IB_INDEX(rp_byte_reset):
177 		*attr_mask |= MLX5_IB_RP_BYTE_RESET_ATTR;
178 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
179 			 rpg_byte_reset, var);
180 		break;
181 	case MLX5_IB_INDEX(rp_threshold):
182 		*attr_mask |= MLX5_IB_RP_THRESHOLD_ATTR;
183 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
184 			 rpg_threshold, var);
185 		break;
186 	case MLX5_IB_INDEX(rp_ai_rate):
187 		*attr_mask |= MLX5_IB_RP_AI_RATE_ATTR;
188 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
189 			 rpg_ai_rate, var);
190 		break;
191 	case MLX5_IB_INDEX(rp_hai_rate):
192 		*attr_mask |= MLX5_IB_RP_HAI_RATE_ATTR;
193 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
194 			 rpg_hai_rate, var);
195 		break;
196 	case MLX5_IB_INDEX(rp_min_dec_fac):
197 		*attr_mask |= MLX5_IB_RP_MIN_DEC_FAC_ATTR;
198 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
199 			 rpg_min_dec_fac, var);
200 		break;
201 	case MLX5_IB_INDEX(rp_min_rate):
202 		*attr_mask |= MLX5_IB_RP_MIN_RATE_ATTR;
203 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
204 			 rpg_min_rate, var);
205 		break;
206 	case MLX5_IB_INDEX(rp_rate_to_set_on_first_cnp):
207 		*attr_mask |= MLX5_IB_RP_RATE_TO_SET_ON_FIRST_CNP_ATTR;
208 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
209 			 rate_to_set_on_first_cnp, var);
210 		break;
211 	case MLX5_IB_INDEX(rp_dce_tcp_g):
212 		*attr_mask |= MLX5_IB_RP_DCE_TCP_G_ATTR;
213 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
214 			 dce_tcp_g, var);
215 		break;
216 	case MLX5_IB_INDEX(rp_dce_tcp_rtt):
217 		*attr_mask |= MLX5_IB_RP_DCE_TCP_RTT_ATTR;
218 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
219 			 dce_tcp_rtt, var);
220 		break;
221 	case MLX5_IB_INDEX(rp_rate_reduce_monitor_period):
222 		*attr_mask |= MLX5_IB_RP_RATE_REDUCE_MONITOR_PERIOD_ATTR;
223 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
224 			 rate_reduce_monitor_period, var);
225 		break;
226 	case MLX5_IB_INDEX(rp_initial_alpha_value):
227 		*attr_mask |= MLX5_IB_RP_INITIAL_ALPHA_VALUE_ATTR;
228 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
229 			 initial_alpha_value, var);
230 		break;
231 	case MLX5_IB_INDEX(rp_gd):
232 		*attr_mask |= MLX5_IB_RP_GD_ATTR;
233 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
234 			 rpg_gd, var);
235 		break;
236 	case MLX5_IB_INDEX(np_cnp_dscp):
237 		*attr_mask |= MLX5_IB_NP_CNP_DSCP_ATTR;
238 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_np, field, cnp_dscp, var);
239 		break;
240 	case MLX5_IB_INDEX(np_cnp_prio_mode):
241 		*attr_mask |= MLX5_IB_NP_CNP_PRIO_MODE_ATTR;
242 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_np, field, cnp_prio_mode, var);
243 		break;
244 	case MLX5_IB_INDEX(np_cnp_prio):
245 		*attr_mask |= MLX5_IB_NP_CNP_PRIO_MODE_ATTR;
246 		MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_prio_mode, 0);
247 		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_np, field, cnp_802p_prio, var);
248 		break;
249 	default:
250 		break;
251 	}
252 }
253 
254 static int
255 mlx5_ib_get_all_cc_params(struct mlx5_ib_dev *dev)
256 {
257 	int outlen = MLX5_ST_SZ_BYTES(query_cong_params_out);
258 	enum mlx5_ib_cong_node_type node = 0;
259 	void *out;
260 	void *field;
261 	u32 x;
262 	int err = 0;
263 
264 	out = kzalloc(outlen, GFP_KERNEL);
265 	if (!out)
266 		return -ENOMEM;
267 
268 	/* get the current values */
269 	for (x = 0; x != MLX5_IB_CONG_PARAMS_NUM; x++) {
270 		if (node != mlx5_ib_param_to_node(x)) {
271 			node = mlx5_ib_param_to_node(x);
272 
273 			err = mlx5_cmd_query_cong_params(dev->mdev, node, out, outlen);
274 			if (err)
275 				break;
276 		}
277 		field = MLX5_ADDR_OF(query_cong_params_out, out, congestion_parameters);
278 		dev->congestion.arg[x] = mlx5_get_cc_param_val(field, x);
279 	}
280 	kfree(out);
281 	return err;
282 }
283 
284 static int
285 mlx5_ib_set_cc_params(struct mlx5_ib_dev *dev, u32 index, u64 var)
286 {
287 	int inlen = MLX5_ST_SZ_BYTES(modify_cong_params_in);
288 	enum mlx5_ib_cong_node_type node;
289 	u32 attr_mask = 0;
290 	void *field;
291 	void *in;
292 	int err;
293 
294 	in = kzalloc(inlen, GFP_KERNEL);
295 	if (!in)
296 		return -ENOMEM;
297 
298 	MLX5_SET(modify_cong_params_in, in, opcode,
299 		 MLX5_CMD_OP_MODIFY_CONG_PARAMS);
300 
301 	node = mlx5_ib_param_to_node(index);
302 	MLX5_SET(modify_cong_params_in, in, cong_protocol, node);
303 
304 	field = MLX5_ADDR_OF(modify_cong_params_in, in, congestion_parameters);
305 	mlx5_ib_set_cc_param_mask_val(field, index, var, &attr_mask);
306 
307 	field = MLX5_ADDR_OF(modify_cong_params_in, in, field_select);
308 	MLX5_SET(field_select_r_roce_rp, field, field_select_r_roce_rp,
309 		 attr_mask);
310 
311 	err = mlx5_cmd_modify_cong_params(dev->mdev, in, inlen);
312 	kfree(in);
313 
314 	return err;
315 }
316 
317 static int
318 mlx5_ib_cong_params_handler(SYSCTL_HANDLER_ARGS)
319 {
320 	struct mlx5_ib_dev *dev = arg1;
321 	u64 value;
322 	int error;
323 
324 	CONG_LOCK(dev);
325 	value = dev->congestion.arg[arg2];
326 	if (req != NULL) {
327 		error = sysctl_handle_64(oidp, &value, 0, req);
328 		if (error || req->newptr == NULL ||
329 		    value == dev->congestion.arg[arg2])
330 			goto done;
331 
332 		/* assign new value */
333 		dev->congestion.arg[arg2] = value;
334 	} else {
335 		error = 0;
336 	}
337 	if (!MLX5_CAP_GEN(dev->mdev, cc_modify_allowed))
338 		error = EPERM;
339 	else {
340 		error = -mlx5_ib_set_cc_params(dev, MLX5_IB_INDEX(arg[arg2]),
341 		    dev->congestion.arg[arg2]);
342 	}
343 done:
344 	CONG_UNLOCK(dev);
345 
346 	return (error);
347 }
348 
349 #define	MLX5_GET_UNALIGNED_64(t,p,f) \
350     (((u64)MLX5_GET(t,p,f##_high) << 32) | MLX5_GET(t,p,f##_low))
351 
352 static void
353 mlx5_ib_read_cong_stats(struct work_struct *work)
354 {
355 	struct mlx5_ib_dev *dev =
356 	    container_of(work, struct mlx5_ib_dev, congestion.dwork.work);
357 	const int outlen = MLX5_ST_SZ_BYTES(query_cong_statistics_out);
358 	void *out;
359 
360 	out = kzalloc(outlen, GFP_KERNEL);
361 	if (!out)
362 		goto done;
363 
364 	CONG_LOCK(dev);
365 	if (mlx5_cmd_query_cong_counter(dev->mdev, 0, out, outlen))
366 		memset(out, 0, outlen);
367 
368 	dev->congestion.syndrome =
369 	    MLX5_GET(query_cong_statistics_out, out, syndrome);
370 	dev->congestion.rp_cur_flows =
371 	    MLX5_GET(query_cong_statistics_out, out, rp_cur_flows);
372 	dev->congestion.sum_flows =
373 	    MLX5_GET(query_cong_statistics_out, out, sum_flows);
374 	dev->congestion.rp_cnp_ignored =
375 	    MLX5_GET_UNALIGNED_64(query_cong_statistics_out, out, rp_cnp_ignored);
376 	dev->congestion.rp_cnp_handled =
377 	    MLX5_GET_UNALIGNED_64(query_cong_statistics_out, out, rp_cnp_handled);
378 	dev->congestion.time_stamp =
379 	    MLX5_GET_UNALIGNED_64(query_cong_statistics_out, out, time_stamp);
380 	dev->congestion.accumulators_period =
381 	    MLX5_GET(query_cong_statistics_out, out, accumulators_period);
382 	dev->congestion.np_ecn_marked_roce_packets =
383 	    MLX5_GET_UNALIGNED_64(query_cong_statistics_out, out, np_ecn_marked_roce_packets);
384 	dev->congestion.np_cnp_sent =
385 	    MLX5_GET_UNALIGNED_64(query_cong_statistics_out, out, np_cnp_sent);
386 
387 	CONG_UNLOCK(dev);
388 	kfree(out);
389 
390 done:
391 	schedule_delayed_work(&dev->congestion.dwork, hz);
392 }
393 
394 void
395 mlx5_ib_cleanup_congestion(struct mlx5_ib_dev *dev)
396 {
397 
398 	while (cancel_delayed_work_sync(&dev->congestion.dwork))
399 		;
400 	sysctl_ctx_free(&dev->congestion.ctx);
401 	sx_destroy(&dev->congestion.lock);
402 }
403 
404 int
405 mlx5_ib_init_congestion(struct mlx5_ib_dev *dev)
406 {
407 	struct sysctl_ctx_list *ctx;
408 	struct sysctl_oid *parent;
409 	struct sysctl_oid *node;
410 	int err;
411 	u32 x;
412 
413 	ctx = &dev->congestion.ctx;
414 	sysctl_ctx_init(ctx);
415 	sx_init(&dev->congestion.lock, "mlx5ibcong");
416 	INIT_DELAYED_WORK(&dev->congestion.dwork, mlx5_ib_read_cong_stats);
417 
418 	if (!MLX5_CAP_GEN(dev->mdev, cc_query_allowed))
419 		return (0);
420 
421 	err = mlx5_ib_get_all_cc_params(dev);
422 	if (err)
423 		return (err);
424 
425 	parent = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(dev->ib_dev.dev.kobj.oidp),
426 	    OID_AUTO, "cong", CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
427 	    "Congestion control");
428 	if (parent == NULL)
429 		return (-ENOMEM);
430 
431 	node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(parent),
432 	    OID_AUTO, "conf", CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
433 	    "Configuration");
434 	if (node == NULL) {
435 		sysctl_ctx_free(&dev->congestion.ctx);
436 		return (-ENOMEM);
437 	}
438 
439 	for (x = 0; x != MLX5_IB_CONG_PARAMS_NUM; x++) {
440 		SYSCTL_ADD_PROC(ctx,
441 		    SYSCTL_CHILDREN(node), OID_AUTO,
442 		    mlx5_ib_cong_params_desc[2 * x],
443 		    CTLTYPE_U64 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
444 		    dev, x, &mlx5_ib_cong_params_handler, "QU",
445 		    mlx5_ib_cong_params_desc[2 * x + 1]);
446 	}
447 
448 	node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(parent),
449 	    OID_AUTO, "stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
450 	    "Statistics");
451 	if (node == NULL) {
452 		sysctl_ctx_free(&dev->congestion.ctx);
453 		return (-ENOMEM);
454 	}
455 
456 	for (x = 0; x != MLX5_IB_CONG_STATS_NUM; x++) {
457 		/* read-only SYSCTLs */
458 		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(node), OID_AUTO,
459 		    mlx5_ib_cong_stats_desc[2 * x],
460 		    CTLFLAG_RD | CTLFLAG_MPSAFE,
461 		    &dev->congestion.arg[x + MLX5_IB_CONG_PARAMS_NUM],
462 		    0, mlx5_ib_cong_stats_desc[2 * x + 1]);
463 	}
464 	schedule_delayed_work(&dev->congestion.dwork, hz);
465 	return (0);
466 }
467