/*- * Copyright (c) 2013-2020, Mellanox Technologies, Ltd. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_rss.h" #include "opt_ratelimit.h" #include #include static const char *mlx5_ib_cong_params_desc[] = { MLX5_IB_CONG_PARAMS(MLX5_IB_STATS_DESC) }; static const char *mlx5_ib_cong_status_desc[] = { MLX5_IB_CONG_STATUS(MLX5_IB_STATS_DESC) }; static const char *mlx5_ib_cong_stats_desc[] = { MLX5_IB_CONG_STATS(MLX5_IB_STATS_DESC) }; #define MLX5_IB_INDEX(field) ( \ (__offsetof(struct mlx5_ib_congestion, field) - \ __offsetof(struct mlx5_ib_congestion, arg[0])) / sizeof(u64)) #define MLX5_IB_FLD_MAX(type, field) ((1ULL << __mlx5_bit_sz(type, field)) - 1ULL) #define MLX5_IB_SET_CLIPPED(type, ptr, field, var) do { \ /* rangecheck */ \ if ((var) > MLX5_IB_FLD_MAX(type, field)) \ (var) = MLX5_IB_FLD_MAX(type, field); \ /* set value */ \ MLX5_SET(type, ptr, field, var); \ } while (0) #define CONG_LOCK(dev) sx_xlock(&(dev)->congestion.lock) #define CONG_UNLOCK(dev) sx_xunlock(&(dev)->congestion.lock) #define CONG_LOCKED(dev) sx_xlocked(&(dev)->congestion.lock) #define MLX5_IB_RP_CLAMP_TGT_RATE_ATTR BIT(1) #define MLX5_IB_RP_CLAMP_TGT_RATE_ATI_ATTR BIT(2) #define MLX5_IB_RP_TIME_RESET_ATTR BIT(3) #define MLX5_IB_RP_BYTE_RESET_ATTR BIT(4) #define MLX5_IB_RP_THRESHOLD_ATTR BIT(5) #define MLX5_IB_RP_AI_RATE_ATTR BIT(7) #define MLX5_IB_RP_HAI_RATE_ATTR BIT(8) #define MLX5_IB_RP_MIN_DEC_FAC_ATTR BIT(9) #define MLX5_IB_RP_MIN_RATE_ATTR BIT(10) #define MLX5_IB_RP_RATE_TO_SET_ON_FIRST_CNP_ATTR BIT(11) #define MLX5_IB_RP_DCE_TCP_G_ATTR BIT(12) #define MLX5_IB_RP_DCE_TCP_RTT_ATTR BIT(13) #define MLX5_IB_RP_RATE_REDUCE_MONITOR_PERIOD_ATTR BIT(14) #define MLX5_IB_RP_INITIAL_ALPHA_VALUE_ATTR BIT(15) #define MLX5_IB_RP_GD_ATTR BIT(16) #define MLX5_IB_NP_CNP_DSCP_ATTR BIT(3) #define MLX5_IB_NP_CNP_PRIO_MODE_ATTR BIT(4) enum mlx5_ib_cong_node_type { MLX5_IB_RROCE_ECN_RP = 1, MLX5_IB_RROCE_ECN_NP = 2, }; static enum mlx5_ib_cong_node_type mlx5_ib_param_to_node(u32 index) { if (index >= MLX5_IB_INDEX(rp_clamp_tgt_rate) && index <= MLX5_IB_INDEX(rp_gd)) return MLX5_IB_RROCE_ECN_RP; else return MLX5_IB_RROCE_ECN_NP; } static u64 mlx5_get_cc_param_val(void *field, u32 index) { switch (index) { case MLX5_IB_INDEX(rp_clamp_tgt_rate): return MLX5_GET(cong_control_r_roce_ecn_rp, field, clamp_tgt_rate); case MLX5_IB_INDEX(rp_clamp_tgt_rate_ati): return MLX5_GET(cong_control_r_roce_ecn_rp, field, clamp_tgt_rate_after_time_inc); case MLX5_IB_INDEX(rp_time_reset): return MLX5_GET(cong_control_r_roce_ecn_rp, field, rpg_time_reset); case MLX5_IB_INDEX(rp_byte_reset): return MLX5_GET(cong_control_r_roce_ecn_rp, field, rpg_byte_reset); case MLX5_IB_INDEX(rp_threshold): return MLX5_GET(cong_control_r_roce_ecn_rp, field, rpg_threshold); case MLX5_IB_INDEX(rp_ai_rate): return MLX5_GET(cong_control_r_roce_ecn_rp, field, rpg_ai_rate); case MLX5_IB_INDEX(rp_hai_rate): return MLX5_GET(cong_control_r_roce_ecn_rp, field, rpg_hai_rate); case MLX5_IB_INDEX(rp_min_dec_fac): return MLX5_GET(cong_control_r_roce_ecn_rp, field, rpg_min_dec_fac); case MLX5_IB_INDEX(rp_min_rate): return MLX5_GET(cong_control_r_roce_ecn_rp, field, rpg_min_rate); case MLX5_IB_INDEX(rp_rate_to_set_on_first_cnp): return MLX5_GET(cong_control_r_roce_ecn_rp, field, rate_to_set_on_first_cnp); case MLX5_IB_INDEX(rp_dce_tcp_g): return MLX5_GET(cong_control_r_roce_ecn_rp, field, dce_tcp_g); case MLX5_IB_INDEX(rp_dce_tcp_rtt): return MLX5_GET(cong_control_r_roce_ecn_rp, field, dce_tcp_rtt); case MLX5_IB_INDEX(rp_rate_reduce_monitor_period): return MLX5_GET(cong_control_r_roce_ecn_rp, field, rate_reduce_monitor_period); case MLX5_IB_INDEX(rp_initial_alpha_value): return MLX5_GET(cong_control_r_roce_ecn_rp, field, initial_alpha_value); case MLX5_IB_INDEX(rp_gd): return MLX5_GET(cong_control_r_roce_ecn_rp, field, rpg_gd); case MLX5_IB_INDEX(np_cnp_dscp): return MLX5_GET(cong_control_r_roce_ecn_np, field, cnp_dscp); case MLX5_IB_INDEX(np_cnp_prio_mode): return MLX5_GET(cong_control_r_roce_ecn_np, field, cnp_prio_mode); case MLX5_IB_INDEX(np_cnp_prio): return MLX5_GET(cong_control_r_roce_ecn_np, field, cnp_802p_prio); default: return 0; } } static void mlx5_ib_set_cc_param_mask_val(void *field, u32 index, u64 var, u32 *attr_mask) { switch (index) { case MLX5_IB_INDEX(rp_clamp_tgt_rate): *attr_mask |= MLX5_IB_RP_CLAMP_TGT_RATE_ATTR; MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field, clamp_tgt_rate, var); break; case MLX5_IB_INDEX(rp_clamp_tgt_rate_ati): *attr_mask |= MLX5_IB_RP_CLAMP_TGT_RATE_ATI_ATTR; MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field, clamp_tgt_rate_after_time_inc, var); break; case MLX5_IB_INDEX(rp_time_reset): *attr_mask |= MLX5_IB_RP_TIME_RESET_ATTR; MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field, rpg_time_reset, var); break; case MLX5_IB_INDEX(rp_byte_reset): *attr_mask |= MLX5_IB_RP_BYTE_RESET_ATTR; MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field, rpg_byte_reset, var); break; case MLX5_IB_INDEX(rp_threshold): *attr_mask |= MLX5_IB_RP_THRESHOLD_ATTR; MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field, rpg_threshold, var); break; case MLX5_IB_INDEX(rp_ai_rate): *attr_mask |= MLX5_IB_RP_AI_RATE_ATTR; MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field, rpg_ai_rate, var); break; case MLX5_IB_INDEX(rp_hai_rate): *attr_mask |= MLX5_IB_RP_HAI_RATE_ATTR; MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field, rpg_hai_rate, var); break; case MLX5_IB_INDEX(rp_min_dec_fac): *attr_mask |= MLX5_IB_RP_MIN_DEC_FAC_ATTR; MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field, rpg_min_dec_fac, var); break; case MLX5_IB_INDEX(rp_min_rate): *attr_mask |= MLX5_IB_RP_MIN_RATE_ATTR; MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field, rpg_min_rate, var); break; case MLX5_IB_INDEX(rp_rate_to_set_on_first_cnp): *attr_mask |= MLX5_IB_RP_RATE_TO_SET_ON_FIRST_CNP_ATTR; MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field, rate_to_set_on_first_cnp, var); break; case MLX5_IB_INDEX(rp_dce_tcp_g): *attr_mask |= MLX5_IB_RP_DCE_TCP_G_ATTR; MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field, dce_tcp_g, var); break; case MLX5_IB_INDEX(rp_dce_tcp_rtt): *attr_mask |= MLX5_IB_RP_DCE_TCP_RTT_ATTR; MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field, dce_tcp_rtt, var); break; case MLX5_IB_INDEX(rp_rate_reduce_monitor_period): *attr_mask |= MLX5_IB_RP_RATE_REDUCE_MONITOR_PERIOD_ATTR; MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field, rate_reduce_monitor_period, var); break; case MLX5_IB_INDEX(rp_initial_alpha_value): *attr_mask |= MLX5_IB_RP_INITIAL_ALPHA_VALUE_ATTR; MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field, initial_alpha_value, var); break; case MLX5_IB_INDEX(rp_gd): *attr_mask |= MLX5_IB_RP_GD_ATTR; MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field, rpg_gd, var); break; case MLX5_IB_INDEX(np_cnp_dscp): *attr_mask |= MLX5_IB_NP_CNP_DSCP_ATTR; MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_np, field, cnp_dscp, var); break; case MLX5_IB_INDEX(np_cnp_prio_mode): *attr_mask |= MLX5_IB_NP_CNP_PRIO_MODE_ATTR; MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_np, field, cnp_prio_mode, var); break; case MLX5_IB_INDEX(np_cnp_prio): *attr_mask |= MLX5_IB_NP_CNP_PRIO_MODE_ATTR; MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_prio_mode, 0); MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_np, field, cnp_802p_prio, var); break; default: break; } } static int mlx5_ib_get_all_cc_params(struct mlx5_ib_dev *dev) { int outlen = MLX5_ST_SZ_BYTES(query_cong_params_out); enum mlx5_ib_cong_node_type node = 0; void *out; void *field; u32 x; int err = 0; out = kzalloc(outlen, GFP_KERNEL); if (!out) return -ENOMEM; /* get the current values */ for (x = 0; x != MLX5_IB_CONG_PARAMS_NUM; x++) { if (node != mlx5_ib_param_to_node(x)) { node = mlx5_ib_param_to_node(x); err = mlx5_cmd_query_cong_params(dev->mdev, node, out, outlen); if (err) break; } field = MLX5_ADDR_OF(query_cong_params_out, out, congestion_parameters); dev->congestion.arg[x] = mlx5_get_cc_param_val(field, x); } kfree(out); return err; } static int mlx5_ib_set_cc_params(struct mlx5_ib_dev *dev, u32 index, u64 var) { int inlen = MLX5_ST_SZ_BYTES(modify_cong_params_in); enum mlx5_ib_cong_node_type node; u32 attr_mask = 0; void *field; void *in; int err; in = kzalloc(inlen, GFP_KERNEL); if (!in) return -ENOMEM; MLX5_SET(modify_cong_params_in, in, opcode, MLX5_CMD_OP_MODIFY_CONG_PARAMS); node = mlx5_ib_param_to_node(index); MLX5_SET(modify_cong_params_in, in, cong_protocol, node); field = MLX5_ADDR_OF(modify_cong_params_in, in, congestion_parameters); mlx5_ib_set_cc_param_mask_val(field, index, var, &attr_mask); field = MLX5_ADDR_OF(modify_cong_params_in, in, field_select); MLX5_SET(field_select_r_roce_rp, field, field_select_r_roce_rp, attr_mask); err = mlx5_cmd_modify_cong_params(dev->mdev, in, inlen); kfree(in); return err; } static int mlx5_ib_cong_params_handler(SYSCTL_HANDLER_ARGS) { struct mlx5_ib_dev *dev = arg1; u64 value; int error; CONG_LOCK(dev); value = dev->congestion.arg[arg2]; if (req != NULL) { error = sysctl_handle_64(oidp, &value, 0, req); if (error || req->newptr == NULL || value == dev->congestion.arg[arg2]) goto done; /* assign new value */ dev->congestion.arg[arg2] = value; } else { error = 0; } if (!MLX5_CAP_GEN(dev->mdev, cc_modify_allowed)) error = EPERM; else { error = -mlx5_ib_set_cc_params(dev, MLX5_IB_INDEX(arg[arg2]), dev->congestion.arg[arg2]); } done: CONG_UNLOCK(dev); return (error); } static int mlx5_ib_get_all_cc_status(struct mlx5_ib_dev *dev) { const int outlen = MLX5_ST_SZ_BYTES(query_cong_status_out); uint32_t out[MLX5_ST_SZ_DW(query_cong_status_out)] = {}; int error; #define MLX5_IB_CONG_STATUS_READ(a,b,c,d,e,node,prio,field) do { \ error = mlx5_cmd_query_cong_status(dev->mdev, node, prio, out, outlen); \ if (error) \ goto done; \ dev->congestion.c = MLX5_GET(query_cong_status_out, out, field); \ } while (0); MLX5_IB_CONG_STATUS(MLX5_IB_CONG_STATUS_READ); done: return (error); } static int mlx5_ib_cong_status_handler(SYSCTL_HANDLER_ARGS) { const int inlen = MLX5_ST_SZ_BYTES(modify_cong_status_in); uint32_t in[MLX5_ST_SZ_DW(modify_cong_status_in)] = {}; struct mlx5_ib_dev *dev = arg1; u64 value; int error; CONG_LOCK(dev); value = dev->congestion.arg[arg2]; if (req != NULL) { error = sysctl_handle_64(oidp, &value, 0, req); /* convert value into a boolean */ value = value ? 1 : 0; if (error || req->newptr == NULL || value == dev->congestion.arg[arg2]) goto done; /* assign new binary value */ dev->congestion.arg[arg2] = value; } else { error = 0; } if (!MLX5_CAP_GEN(dev->mdev, cc_modify_allowed)) error = EPERM; else switch (arg2) { #define MLX5_IB_CONG_STATUS_WRITE(a,b,c,d,e,node,prio,field) \ case MLX5_IB_INDEX(c): \ MLX5_SET(modify_cong_status_in, in, opcode, \ MLX5_CMD_OP_MODIFY_CONG_STATUS); \ MLX5_SET(modify_cong_status_in, in, priority, prio); \ MLX5_SET(modify_cong_status_in, in, cong_protocol, node); \ MLX5_SET(modify_cong_status_in, in, field, value); \ error = -mlx5_cmd_modify_cong_status(dev->mdev, in, inlen); \ break; MLX5_IB_CONG_STATUS(MLX5_IB_CONG_STATUS_WRITE) default: error = EINVAL; break; } done: CONG_UNLOCK(dev); return (error); } #define MLX5_GET_UNALIGNED_64(t,p,f) \ (((u64)MLX5_GET(t,p,f##_high) << 32) | MLX5_GET(t,p,f##_low)) static void mlx5_ib_read_cong_stats(struct work_struct *work) { struct mlx5_ib_dev *dev = container_of(work, struct mlx5_ib_dev, congestion.dwork.work); const int outlen = MLX5_ST_SZ_BYTES(query_cong_statistics_out); void *out; out = kzalloc(outlen, GFP_KERNEL); if (!out) goto done; CONG_LOCK(dev); if (mlx5_cmd_query_cong_counter(dev->mdev, 0, out, outlen)) memset(out, 0, outlen); dev->congestion.syndrome = MLX5_GET(query_cong_statistics_out, out, syndrome); dev->congestion.rp_cur_flows = MLX5_GET(query_cong_statistics_out, out, rp_cur_flows); dev->congestion.sum_flows = MLX5_GET(query_cong_statistics_out, out, sum_flows); dev->congestion.rp_cnp_ignored = MLX5_GET_UNALIGNED_64(query_cong_statistics_out, out, rp_cnp_ignored); dev->congestion.rp_cnp_handled = MLX5_GET_UNALIGNED_64(query_cong_statistics_out, out, rp_cnp_handled); dev->congestion.time_stamp = MLX5_GET_UNALIGNED_64(query_cong_statistics_out, out, time_stamp); dev->congestion.accumulators_period = MLX5_GET(query_cong_statistics_out, out, accumulators_period); dev->congestion.np_ecn_marked_roce_packets = MLX5_GET_UNALIGNED_64(query_cong_statistics_out, out, np_ecn_marked_roce_packets); dev->congestion.np_cnp_sent = MLX5_GET_UNALIGNED_64(query_cong_statistics_out, out, np_cnp_sent); CONG_UNLOCK(dev); kfree(out); done: schedule_delayed_work(&dev->congestion.dwork, hz); } void mlx5_ib_cleanup_congestion(struct mlx5_ib_dev *dev) { while (cancel_delayed_work_sync(&dev->congestion.dwork)) ; sysctl_ctx_free(&dev->congestion.ctx); sx_destroy(&dev->congestion.lock); } int mlx5_ib_init_congestion(struct mlx5_ib_dev *dev) { struct sysctl_ctx_list *ctx; struct sysctl_oid *parent; struct sysctl_oid *node; int err; u32 x; ctx = &dev->congestion.ctx; sysctl_ctx_init(ctx); sx_init(&dev->congestion.lock, "mlx5ibcong"); INIT_DELAYED_WORK(&dev->congestion.dwork, mlx5_ib_read_cong_stats); if (!MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) return (0); err = mlx5_ib_get_all_cc_params(dev); if (err) return (err); err = mlx5_ib_get_all_cc_status(dev); if (err) return (err); parent = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(dev->ib_dev.dev.kobj.oidp), OID_AUTO, "cong", CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Congestion control"); if (parent == NULL) return (-ENOMEM); node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(parent), OID_AUTO, "conf", CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Configuration"); if (node == NULL) { sysctl_ctx_free(&dev->congestion.ctx); return (-ENOMEM); } for (x = 0; x != MLX5_IB_CONG_PARAMS_NUM; x++) { SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(node), OID_AUTO, mlx5_ib_cong_params_desc[2 * x], CTLTYPE_U64 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, dev, x, &mlx5_ib_cong_params_handler, "QU", mlx5_ib_cong_params_desc[2 * x + 1]); } for (x = 0; x != MLX5_IB_CONG_STATUS_NUM; x++) { SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(node), OID_AUTO, mlx5_ib_cong_status_desc[2 * x], CTLTYPE_U64 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, dev, x + MLX5_IB_CONG_PARAMS_NUM + MLX5_IB_CONG_STATS_NUM, &mlx5_ib_cong_status_handler, "QU", mlx5_ib_cong_status_desc[2 * x + 1]); } node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(parent), OID_AUTO, "stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Statistics"); if (node == NULL) { sysctl_ctx_free(&dev->congestion.ctx); return (-ENOMEM); } for (x = 0; x != MLX5_IB_CONG_STATS_NUM; x++) { /* read-only SYSCTLs */ SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(node), OID_AUTO, mlx5_ib_cong_stats_desc[2 * x], CTLFLAG_RD | CTLFLAG_MPSAFE, &dev->congestion.arg[x + MLX5_IB_CONG_PARAMS_NUM], 0, mlx5_ib_cong_stats_desc[2 * x + 1]); } schedule_delayed_work(&dev->congestion.dwork, hz); return (0); }