#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# A test for switch behavior under MC overload. An issue in Spectrum chips
# causes throughput of UC traffic to drop severely when a switch is under heavy
# MC load. This issue can be overcome by putting the switch into MC-aware mode.
# This test verifies that UC performance stays intact even as the switch is
# under MC flood, and therefore that the MC-aware mode is enabled and correctly
# configured.
#
# Because mlxsw throttles the CPU port, the traffic can't actually reach
# userspace at full speed. That makes it impossible to use iperf3 to simply
# measure the throughput, because many packets (that reach $h3) don't get to
# the kernel at all even in UDP mode (the situation is even worse in TCP mode,
# where one can't hope to see more than a couple Mbps).
#
# So instead we send traffic with mausezahn and use RX ethtool counters at
# $h3. Multicast traffic is untagged, unicast traffic is tagged with PCP 1.
# Therefore each gets a different priority and we can use per-prio ethtool
# counters to measure the throughput. In order to avoid prioritizing unicast
# traffic, prio qdisc is installed on $swp3 and maps all priorities to the
# same band #7 (and thus TC 0).
#
# Mausezahn can't actually saturate the links unless it's using large frames.
# Thus we set MTU to 10K on all involved interfaces. Then both unicast and
# multicast traffic use 8K frames.
#
# +---------------------------+            +----------------------------------+
# | H1                        |            |                               H2 |
# |                           |            |  unicast --> + $h2.111           |
# |                 multicast |            |  traffic     | 192.0.2.129/28    |
# |                 traffic   |            |              | e-qos-map 0:1     |
# |           $h1 + <-----    |            |              |                   |
# | 192.0.2.65/28 |           |            |              + $h2               |
# +---------------|-----------+            +--------------|-------------------+
#                 |                                       |
# +---------------|---------------------------------------|-------------------+
# |         $swp1 +                                       + $swp2             |
# |        >1Gbps |                                       | >1Gbps            |
# | +-------------|------+                     +----------|----------------+  |
# | |     $swp1.1 +      |                     |          + $swp2.111      |  |
# | |             BR1    |         SW          |          BR111            |  |
# | |     $swp3.1 +      |                     |          + $swp3.111      |  |
# | +-------------|------+                     +----------|----------------+  |
# |               \_______________________________________/                   |
# |                                    |                                      |
# |                                    + $swp3                                |
# |                                    | 1Gbps bottleneck                     |
# |                                    | prio qdisc: {0..7} -> 7              |
# +------------------------------------|--------------------------------------+
#                                      |
#                                   +--|-----------------+
#                                   |  + $h3          H3 |
#                                   |  | 192.0.2.66/28   |
#                                   |  |                 |
#                                   |  + $h3.111         |
#                                   |    192.0.2.130/28  |
#                                   +--------------------+
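#
# For reference, the per-priority counters this test relies on are plain
# "ethtool -S" statistics, so a run can be cross-checked by hand. A minimal
# sketch, assuming an mlxsw netdev that exposes the rx_octets_prio_* counters
# (eth0 stands in for the real interface name):
#
#	ethtool -S eth0 | grep 'rx_octets_prio_[01]'
#
# ethtool_stats_get from lib.sh reads the same statistics; the rate and
# humanize helpers from the sourced libraries then turn two such readings and
# a time interval into the throughput figures printed below.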

ALL_TESTS="
	ping_ipv4
	test_mc_aware
	test_uc_aware
"

lib_dir=$(dirname $0)/../../../net/forwarding

NUM_NETIFS=6
source $lib_dir/lib.sh
source $lib_dir/devlink_lib.sh
source qos_lib.sh

h1_create()
{
	simple_if_init $h1 192.0.2.65/28
	defer simple_if_fini $h1 192.0.2.65/28

	mtu_set $h1 10000
	defer mtu_restore $h1
}

h2_create()
{
	simple_if_init $h2
	defer simple_if_fini $h2

	mtu_set $h2 10000
	defer mtu_restore $h2

	vlan_create $h2 111 v$h2 192.0.2.129/28
	defer vlan_destroy $h2 111
	ip link set dev $h2.111 type vlan egress-qos-map 0:1
}

h3_create()
{
	simple_if_init $h3 192.0.2.66/28
	defer simple_if_fini $h3 192.0.2.66/28

	mtu_set $h3 10000
	defer mtu_restore $h3

	vlan_create $h3 111 v$h3 192.0.2.130/28
	defer vlan_destroy $h3 111
}

switch_create()
{
	ip link set dev $swp1 up
	defer ip link set dev $swp1 down

	mtu_set $swp1 10000
	defer mtu_restore $swp1

	ip link set dev $swp2 up
	defer ip link set dev $swp2 down

	mtu_set $swp2 10000
	defer mtu_restore $swp2

	ip link set dev $swp3 up
	defer ip link set dev $swp3 down

	mtu_set $swp3 10000
	defer mtu_restore $swp3

	vlan_create $swp2 111
	defer vlan_destroy $swp2 111

	vlan_create $swp3 111
	defer vlan_destroy $swp3 111

	tc qdisc replace dev $swp3 root handle 3: tbf rate 1gbit \
		burst 128K limit 1G
	defer tc qdisc del dev $swp3 root handle 3:

	tc qdisc replace dev $swp3 parent 3:3 handle 33: \
		prio bands 8 priomap 7 7 7 7 7 7 7 7
	defer tc qdisc del dev $swp3 parent 3:3 handle 33:

	ip link add name br1 type bridge vlan_filtering 0
	defer ip link del dev br1
	ip link set dev br1 addrgenmode none
	ip link set dev br1 up

	ip link set dev $swp1 master br1
	defer ip link set dev $swp1 nomaster

	ip link set dev $swp3 master br1
	defer ip link set dev $swp3 nomaster

	ip link add name br111 type bridge vlan_filtering 0
	defer ip link del dev br111
	ip link set dev br111 addrgenmode none
	ip link set dev br111 up

	ip link set dev $swp2.111 master br111
	defer ip link set dev $swp2.111 nomaster

	ip link set dev $swp3.111 master br111
	defer ip link set dev $swp3.111 nomaster

	# Make sure that ingress quotas are smaller than egress so that there is
	# room for both streams of traffic to be admitted to shared buffer.
	devlink_port_pool_th_save $swp1 0
	devlink_port_pool_th_set $swp1 0 5
	defer devlink_port_pool_th_restore $swp1 0

	devlink_tc_bind_pool_th_save $swp1 0 ingress
	devlink_tc_bind_pool_th_set $swp1 0 ingress 0 5
	defer devlink_tc_bind_pool_th_restore $swp1 0 ingress

	devlink_port_pool_th_save $swp2 0
	devlink_port_pool_th_set $swp2 0 5
	defer devlink_port_pool_th_restore $swp2 0

	devlink_tc_bind_pool_th_save $swp2 1 ingress
	devlink_tc_bind_pool_th_set $swp2 1 ingress 0 5
	defer devlink_tc_bind_pool_th_restore $swp2 1 ingress

	devlink_port_pool_th_save $swp3 4
	devlink_port_pool_th_set $swp3 4 12
	defer devlink_port_pool_th_restore $swp3 4
}

setup_prepare()
{
	h1=${NETIFS[p1]}
	swp1=${NETIFS[p2]}

	swp2=${NETIFS[p3]}
	h2=${NETIFS[p4]}

	swp3=${NETIFS[p5]}
	h3=${NETIFS[p6]}

	h3mac=$(mac_get $h3)

	vrf_prepare
	defer vrf_cleanup

	h1_create
	h2_create
	h3_create
	switch_create
}

ping_ipv4()
{
	ping_test $h2 192.0.2.130
}

__run_uc_measure_rate()
{
	local what=$1; shift
	local -a uc_rate

	start_traffic $h2.111 192.0.2.129 192.0.2.130 $h3mac
	defer stop_traffic $!

	uc_rate=($(measure_rate $swp2 $h3 rx_octets_prio_1 "$what"))
	check_err $? "Could not get high enough $what ingress rate"

	echo ${uc_rate[@]}
}

run_uc_measure_rate()
{
	in_defer_scope __run_uc_measure_rate "$@"
}

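# A worked example of the degradation figure computed by test_mc_aware()
# below, with made-up counter rates (the units cancel out in the ratio):
#
#	ucth1=117000000		# UC-only egress rate
#	ucth2=93600000		# UC egress rate under MC flood
#	bc <<< "scale=2; 100 * ($ucth1 - $ucth2) / $ucth1"
#	# => 20.00
#
# The 200Mbps minimum shaper on the MC TCs of the 1Gbps bottleneck predicts
# roughly 20% degradation, hence the [15, 25] acceptance window below.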
test_mc_aware()
{
	RET=0

	local -a uc_rate=($(run_uc_measure_rate "UC-only"))
	local ucth1=${uc_rate[1]}

	start_traffic $h1 192.0.2.65 bc bc
	defer stop_traffic $!

	local d0=$(date +%s)
	local t0=$(ethtool_stats_get $h3 rx_octets_prio_0)
	local u0=$(ethtool_stats_get $swp1 rx_octets_prio_0)

	local -a uc_rate_2=($(run_uc_measure_rate "UC+MC"))
	local ucth2=${uc_rate_2[1]}

	local d1=$(date +%s)
	local t1=$(ethtool_stats_get $h3 rx_octets_prio_0)
	local u1=$(ethtool_stats_get $swp1 rx_octets_prio_0)

	local deg=$(bc <<< "
			scale=2
			ret = 100 * ($ucth1 - $ucth2) / $ucth1
			if (ret > 0) { ret } else { 0 }
		    ")

	# Minimum shaper of 200Mbps on MC TCs should cause about 20% of
	# degradation on 1Gbps link.
	check_err $(bc <<< "$deg < 15") "Minimum shaper not in effect"
	check_err $(bc <<< "$deg > 25") "MC traffic degrades UC performance too much"

	local interval=$((d1 - d0))
	local mc_ir=$(rate $u0 $u1 $interval)
	local mc_er=$(rate $t0 $t1 $interval)

	log_test "UC performance under MC overload"

	echo "UC-only throughput  $(humanize $ucth1)"
	echo "UC+MC throughput    $(humanize $ucth2)"
	echo "Degradation         $deg %"
	echo
	echo "Full report:"
	echo "  UC only:"
	echo "    ingress UC throughput $(humanize ${uc_rate[0]})"
	echo "    egress UC throughput  $(humanize ${uc_rate[1]})"
	echo "  UC+MC:"
	echo "    ingress UC throughput $(humanize ${uc_rate_2[0]})"
	echo "    egress UC throughput  $(humanize ${uc_rate_2[1]})"
	echo "    ingress MC throughput $(humanize $mc_ir)"
	echo "    egress MC throughput  $(humanize $mc_er)"
	echo
}

test_uc_aware()
{
	RET=0

	start_traffic $h2.111 192.0.2.129 192.0.2.130 $h3mac
	defer stop_traffic $!

	local d0=$(date +%s)
	local t0=$(ethtool_stats_get $h3 rx_octets_prio_1)
	local u0=$(ethtool_stats_get $swp2 rx_octets_prio_1)
	sleep 1

	local attempts=50
	local passes=0
	local i

	for ((i = 0; i < attempts; ++i)); do
		if $ARPING -c 1 -I $h1 -b 192.0.2.66 -q -w 1; then
			((passes++))
		fi

		sleep 0.1
	done

	local d1=$(date +%s)
	local t1=$(ethtool_stats_get $h3 rx_octets_prio_1)
	local u1=$(ethtool_stats_get $swp2 rx_octets_prio_1)

	local interval=$((d1 - d0))
	local uc_ir=$(rate $u0 $u1 $interval)
	local uc_er=$(rate $t0 $t1 $interval)

	((attempts == passes))
	check_err $?

	log_test "MC performance under UC overload"
	echo "    ingress UC throughput $(humanize ${uc_ir})"
	echo "    egress UC throughput  $(humanize ${uc_er})"
	echo "    sent $attempts BC ARPs, got $passes responses"
}

trap cleanup EXIT

setup_prepare
setup_wait

tests_run

exit $EXIT_STATUS
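
# For reference, lib.sh takes the topology from forwarding.config; a sketch
# of the six-port layout this test expects, in the style of
# forwarding.config.sample (eth0..eth5 are placeholders):
#
#	declare -A NETIFS
#	NETIFS[p1]=eth0		# $h1
#	NETIFS[p2]=eth1		# $swp1
#	NETIFS[p3]=eth2		# $swp2
#	NETIFS[p4]=eth3		# $h2
#	NETIFS[p5]=eth4		# $swp3
#	NETIFS[p6]=eth5		# $h3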