#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# A test for switch behavior under MC overload. An issue in Spectrum chips
# causes the throughput of UC traffic to drop severely when a switch is under
# heavy MC load. This issue can be overcome by putting the switch into MC-aware
# mode. This test verifies that UC performance stays intact even as the switch
# is under MC flood, and therefore that MC-aware mode is enabled and correctly
# configured.
#
# Because mlxsw throttles the CPU port, the traffic can't actually reach
# userspace at full speed. That makes it impossible to simply measure the
# throughput with iperf3, because many packets (that reach $h3) don't get to
# the kernel at all even in UDP mode (the situation is even worse in TCP mode,
# where one can't hope to see more than a couple of Mbps).
#
# So instead we send traffic with mausezahn and use RX ethtool counters at $h3.
# Multicast traffic is untagged, whereas unicast traffic is tagged with PCP 1.
# Therefore each gets a different priority and we can use per-prio ethtool
# counters to measure the throughput. In order to avoid prioritizing unicast
# traffic, a prio qdisc is installed on $swp3 and maps all priorities to the
# same band #7 (and thus TC 0).
#
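# For reference, these per-prio counters can also be read by hand with
# ethtool; e.g. at the $h3 end station:
#
#     ethtool -S $h3 | grep rx_octets_prio_
#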
# Mausezahn can't actually saturate the links unless it's using large frames.
# Thus we set the MTU to 10K on all involved interfaces. Then both unicast and
# multicast traffic uses 8K frames.
#
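# (For the curious: the start_traffic helper used below boils down to a
# backgrounded mausezahn invocation roughly along these lines, flags
# paraphrased from the forwarding library:
#
#     mausezahn $iface -p 8000 -c 0 -a own -b $dmac -A $sip -B $dip -t udp -q &
#
# where -p pads the payload to 8000 bytes and -c 0 sends indefinitely.)
#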
# +---------------------------+            +----------------------------------+
# | H1                        |            |                               H2 |
# |                           |            |  unicast --> + $h2.111           |
# |                 multicast |            |  traffic     | 192.0.2.129/28    |
# |                 traffic   |            |              | e-qos-map 0:1     |
# |           $h1 + <-----    |            |              |                   |
# | 192.0.2.65/28 |           |            |              + $h2               |
# +---------------|-----------+            +--------------|-------------------+
#                 |                                       |
# +---------------|---------------------------------------|-------------------+
# |         $swp1 +                                       + $swp2             |
# |        >1Gbps |                                       | >1Gbps            |
# | +-------------|------+                     +----------|----------------+  |
# | |     $swp1.1 +      |                     |          + $swp2.111      |  |
# | |                BR1 |             SW      | BR111                     |  |
# | |     $swp3.1 +      |                     |          + $swp3.111      |  |
# | +-------------|------+                     +----------|----------------+  |
# |               \_______________________________________/                   |
# |                                    |                                      |
# |                                    + $swp3                                |
# |                                    | 1Gbps bottleneck                     |
# |                                    | prio qdisc: {0..7} -> 7              |
# +------------------------------------|--------------------------------------+
#                                      |
#                                   +--|-----------------+
#                                   |  + $h3          H3 |
#                                   |  | 192.0.2.66/28   |
#                                   |  |                 |
#                                   |  + $h3.111         |
#                                   |    192.0.2.130/28  |
#                                   +--------------------+

ALL_TESTS="
	ping_ipv4
	test_mc_aware
	test_uc_aware
"

lib_dir=$(dirname $0)/../../../net/forwarding

NUM_NETIFS=6
source $lib_dir/lib.sh
source $lib_dir/devlink_lib.sh
source qos_lib.sh

h1_create()
{
	simple_if_init $h1 192.0.2.65/28
	defer simple_if_fini $h1 192.0.2.65/28

	mtu_set $h1 10000
	defer mtu_restore $h1
}

h2_create()
{
	simple_if_init $h2
	defer simple_if_fini $h2

	mtu_set $h2 10000
	defer mtu_restore $h2

	vlan_create $h2 111 v$h2 192.0.2.129/28
	defer vlan_destroy $h2 111
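	# Map skb priority 0 to VLAN PCP 1, so that the UC stream is tagged
	# with PCP 1 and shows up in the rx_octets_prio_1 counters.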
	ip link set dev $h2.111 type vlan egress-qos-map 0:1
}

h3_create()
{
	simple_if_init $h3 192.0.2.66/28
	defer simple_if_fini $h3 192.0.2.66/28

	mtu_set $h3 10000
	defer mtu_restore $h3

	vlan_create $h3 111 v$h3 192.0.2.130/28
	defer vlan_destroy $h3 111
}

switch_create()
{
	ip link set dev $swp1 up
	defer ip link set dev $swp1 down

	mtu_set $swp1 10000
	defer mtu_restore $swp1

	ip link set dev $swp2 up
	defer ip link set dev $swp2 down

	mtu_set $swp2 10000
	defer mtu_restore $swp2

	ip link set dev $swp3 up
	defer ip link set dev $swp3 down

	mtu_set $swp3 10000
	defer mtu_restore $swp3

	vlan_create $swp2 111
	defer vlan_destroy $swp2 111

	vlan_create $swp3 111
	defer vlan_destroy $swp3 111

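	# Set up the 1-Gbps bottleneck shown in the diagram: a tbf shaper at
	# the root, with a child prio qdisc that funnels all priorities into a
	# single band so that UC gets no preferential treatment at $swp3. The
	# resulting hierarchy can be checked with, e.g.:
	#
	#     tc qdisc show dev $swp3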
	tc qdisc replace dev $swp3 root handle 3: tbf rate 1gbit \
		burst 128K limit 1G
	defer tc qdisc del dev $swp3 root handle 3:

	tc qdisc replace dev $swp3 parent 3:3 handle 33: \
		prio bands 8 priomap 7 7 7 7 7 7 7 7
	defer tc qdisc del dev $swp3 parent 3:3 handle 33:

	ip link add name br1 type bridge vlan_filtering 0
	defer ip link del dev br1
	ip link set dev br1 addrgenmode none
	ip link set dev br1 up

	ip link set dev $swp1 master br1
	defer ip link set dev $swp1 nomaster

	ip link set dev $swp3 master br1
	defer ip link set dev $swp3 nomaster

	ip link add name br111 type bridge vlan_filtering 0
	defer ip link del dev br111
	ip link set dev br111 addrgenmode none
	ip link set dev br111 up

	ip link set dev $swp2.111 master br111
	defer ip link set dev $swp2.111 nomaster

	ip link set dev $swp3.111 master br111
	defer ip link set dev $swp3.111 nomaster

	# Make sure that ingress quotas are smaller than the egress ones, so
	# that there is room for both streams of traffic to be admitted to the
	# shared buffer.
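	# (The thresholds configured below can be dumped with "devlink sb port
	# pool show" and "devlink sb tc bind show".)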
	devlink_port_pool_th_save $swp1 0
	devlink_port_pool_th_set $swp1 0 5
	defer devlink_port_pool_th_restore $swp1 0

	devlink_tc_bind_pool_th_save $swp1 0 ingress
	devlink_tc_bind_pool_th_set $swp1 0 ingress 0 5
	defer devlink_tc_bind_pool_th_restore $swp1 0 ingress

	devlink_port_pool_th_save $swp2 0
	devlink_port_pool_th_set $swp2 0 5
	defer devlink_port_pool_th_restore $swp2 0

	devlink_tc_bind_pool_th_save $swp2 1 ingress
	devlink_tc_bind_pool_th_set $swp2 1 ingress 0 5
	defer devlink_tc_bind_pool_th_restore $swp2 1 ingress

	devlink_port_pool_th_save $swp3 4
	devlink_port_pool_th_set $swp3 4 12
	defer devlink_port_pool_th_restore $swp3 4
}

setup_prepare()
{
	h1=${NETIFS[p1]}
	swp1=${NETIFS[p2]}

	swp2=${NETIFS[p3]}
	h2=${NETIFS[p4]}

	swp3=${NETIFS[p5]}
	h3=${NETIFS[p6]}

	h3mac=$(mac_get $h3)

	vrf_prepare
	defer vrf_cleanup

	h1_create
	h2_create
	h3_create
	switch_create
}

ping_ipv4()
{
	ping_test $h2 192.0.2.130
}

__run_uc_measure_rate()
{
	local what=$1; shift
	local -a uc_rate

	start_traffic $h2.111 192.0.2.129 192.0.2.130 $h3mac
	defer stop_traffic $!

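	# measure_rate prints the ingress rate (sampled at $swp2) and the
	# egress rate (sampled at $h3), both from the prio-1 octet counters.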
	uc_rate=($(measure_rate $swp2 $h3 rx_octets_prio_1 "$what"))
	check_err $? "Could not get high enough $what ingress rate"

	echo ${uc_rate[@]}
}

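# Run the measurement in its own defer scope, so that the deferred
# stop_traffic fires as soon as the measurement is done.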
run_uc_measure_rate()
{
	in_defer_scope __run_uc_measure_rate "$@"
}

test_mc_aware()
{
	RET=0

	local -a uc_rate=($(run_uc_measure_rate "UC-only"))
	local ucth1=${uc_rate[1]}

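	# Start the MC load: flood broadcast traffic from $h1; br1 floods it
	# out of $swp3, where it competes with the UC stream.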
	start_traffic $h1 192.0.2.65 bc bc
	defer stop_traffic $!

	local d0=$(date +%s)
	local t0=$(ethtool_stats_get $h3 rx_octets_prio_0)
	local u0=$(ethtool_stats_get $swp1 rx_octets_prio_0)

	local -a uc_rate_2=($(run_uc_measure_rate "UC+MC"))
	local ucth2=${uc_rate_2[1]}

	local d1=$(date +%s)
	local t1=$(ethtool_stats_get $h3 rx_octets_prio_0)
	local u1=$(ethtool_stats_get $swp1 rx_octets_prio_0)

	local deg=$(bc <<< "
			scale=2
			ret = 100 * ($ucth1 - $ucth2) / $ucth1
			if (ret > 0) { ret } else { 0 }
		    ")
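
	# E.g. an 850-Mbps UC-only baseline dropping to 680 Mbps under MC load
	# comes out as deg = 100 * (850 - 680) / 850 = 20.00.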

	# The minimum shaper of 200 Mbps on the MC TCs should cause roughly
	# 20% degradation on the 1-Gbps link.
	check_err $(bc <<< "$deg < 15") "Minimum shaper not in effect"
	check_err $(bc <<< "$deg > 25") "MC traffic degrades UC performance too much"

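	# Convert the octet-counter deltas sampled around the UC+MC window
	# into MC ingress/egress rates.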
	local interval=$((d1 - d0))
	local mc_ir=$(rate $u0 $u1 $interval)
	local mc_er=$(rate $t0 $t1 $interval)

	log_test "UC performance under MC overload"

	echo "UC-only throughput  $(humanize $ucth1)"
	echo "UC+MC throughput    $(humanize $ucth2)"
	echo "Degradation         $deg %"
	echo
	echo "Full report:"
	echo "  UC only:"
	echo "    ingress UC throughput $(humanize ${uc_rate[0]})"
	echo "    egress UC throughput  $(humanize ${uc_rate[1]})"
	echo "  UC+MC:"
	echo "    ingress UC throughput $(humanize ${uc_rate_2[0]})"
	echo "    egress UC throughput  $(humanize ${uc_rate_2[1]})"
	echo "    ingress MC throughput $(humanize $mc_ir)"
	echo "    egress MC throughput  $(humanize $mc_er)"
	echo
}

test_uc_aware()
{
	RET=0

	start_traffic $h2.111 192.0.2.129 192.0.2.130 $h3mac
	defer stop_traffic $!

	local d0=$(date +%s)
	local t0=$(ethtool_stats_get $h3 rx_octets_prio_1)
	local u0=$(ethtool_stats_get $swp2 rx_octets_prio_1)
	sleep 1

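	# With UC hogging the $swp3 bottleneck, probe with broadcast ARPs from
	# $h1: if BUM traffic is not starved, every request should get a reply.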
	local attempts=50
	local passes=0
	local i

	for ((i = 0; i < attempts; ++i)); do
		if $ARPING -c 1 -I $h1 -b 192.0.2.66 -q -w 1; then
			((passes++))
		fi

		sleep 0.1
	done

	local d1=$(date +%s)
	local t1=$(ethtool_stats_get $h3 rx_octets_prio_1)
	local u1=$(ethtool_stats_get $swp2 rx_octets_prio_1)

	local interval=$((d1 - d0))
	local uc_ir=$(rate $u0 $u1 $interval)
	local uc_er=$(rate $t0 $t1 $interval)

	((attempts == passes))
	check_err $?

	log_test "MC performance under UC overload"
	echo "    ingress UC throughput $(humanize $uc_ir)"
	echo "    egress UC throughput  $(humanize $uc_er)"
	echo "    sent $attempts BC ARPs, got $passes responses"
}

trap cleanup EXIT

setup_prepare
setup_wait

tests_run

exit $EXIT_STATUS