xref: /linux/tools/testing/selftests/drivers/net/hw/devlink_rate_tc_bw.py (revision 84318277d6334c6981ab326d4acc87c6a6ddc9b8)
1#!/usr/bin/env python3
2# SPDX-License-Identifier: GPL-2.0
3
4"""
5Devlink Rate TC Bandwidth Test Suite
6===================================
7
8This test suite verifies the functionality of devlink-rate traffic class (TC)
9bandwidth distribution in a virtualized environment. The tests validate that
10bandwidth can be properly allocated between different traffic classes and
11that TC mapping works as expected.
12
13Test Environment:
14----------------
15- Creates 1 VF
16- Establishes a bridge connecting the VF representor and the uplink representor
17- Sets up 2 VLAN interfaces on the VF with different VLAN IDs (101, 102)
18- Configures different traffic classes (TC3 and TC4) for each VLAN
19
20Test Cases:
21----------
221. test_no_tc_mapping_bandwidth:
23   - Verifies that without TC mapping, bandwidth is NOT distributed according to
24     the configured 20/80 split between TC3 and TC4
25   - This test should fail if bandwidth matches the 20/80 split without TC
26     mapping
27   - Expected: Bandwidth should NOT be distributed as 20/80
28
292. test_tc_mapping_bandwidth:
30   - Configures TC mapping using mqprio qdisc
31   - Verifies that with TC mapping, bandwidth IS distributed according to the
32     configured 20/80 split between TC3 and TC4
33   - Expected: Bandwidth should be distributed as 20/80
34
35Bandwidth Distribution:
36----------------------
37- TC3 (VLAN 101): Configured for 20% of total bandwidth
38- TC4 (VLAN 102): Configured for 80% of total bandwidth
39- Total bandwidth: 1Gbps
40- Tolerance: +-12%
41
42Hardware-Specific Behavior (mlx5):
43--------------------------
44mlx5 hardware enforces traffic class separation by ensuring that each transmit
45queue (SQ) is associated with a single TC. If a packet is sent on a queue that
46doesn't match the expected TC (based on DSCP or VLAN priority and hypervisor-set
47mapping), the hardware moves the queue to the correct TC scheduler to preserve
48traffic isolation.
49
50This behavior means that even without explicit TC-to-queue mapping, bandwidth
51enforcement may still appear to work—because the hardware dynamically adjusts
the scheduling context. However, this can lead to performance issues at high
rates and to head-of-line (HOL) blocking if traffic from different TCs is mixed
on the same queue.
54"""
55
56import json
57import os
58import subprocess
59import threading
60import time
61
62from lib.py import ksft_pr, ksft_run, ksft_exit
63from lib.py import KsftSkipEx, KsftFailEx, KsftXfailEx
64from lib.py import NetDrvEpEnv, DevlinkFamily
65from lib.py import NlError
66from lib.py import cmd, defer, ethtool, ip
67from lib.py import Iperf3Runner
68
69
class BandwidthValidator:
    """
    Checks measured values against a set of expected shares.

    Every share gets an acceptance window of +/- ``tolerance_percent`` of the
    sum of all expected shares (i.e. the margin is relative to the overall
    total, not to the individual share).
    """

    def __init__(self, shares):
        self.tolerance_percent = 12
        self.expected_total = sum(shares.values())
        # name -> (lowest accepted value, highest accepted value)
        self.bounds = {
            name: (self.min_expected(expected), self.max_expected(expected))
            for name, expected in shares.items()
        }

    def _margin(self):
        """Absolute tolerance, derived from the expected total."""
        return self.expected_total * self.tolerance_percent / 100

    def min_expected(self, value):
        """Return the lowest value still accepted for an expected `value`."""
        return value - self._margin()

    def max_expected(self, value):
        """Return the highest value still accepted for an expected `value`."""
        return value + self._margin()

    def bound(self, values):
        """
        Tell whether every entry of `values` lies inside its window.

        `values` maps share names (as given to the constructor) to measured
        numbers; an unknown name raises KeyError.
        """
        def _in_window(name, measured):
            low, high = self.bounds[name]
            return low <= measured <= high

        return all(_in_window(name, measured)
                   for name, measured in values.items())
101
102
def setup_vf(cfg, set_tc_mapping=True):
    """
    Sets up a VF on the given network interface.

    Switches the eswitch to switchdev mode, creates one VF through
    sriov_numvfs, brings the VF netdev up and, when `set_tc_mapping` is
    true, installs an mqprio qdisc on it. Cleanup handlers that restore
    legacy mode and remove the VF are registered with defer().

    Returns the name of the VF network interface.

    Raises KsftSkipEx when switchdev mode or SR-IOV cannot be enabled, or
    when no VF netdev shows up under the PF's virtfn0 directory.
    """
    try:
        cmd(f"devlink dev eswitch set pci/{cfg.pci} mode switchdev")
        defer(cmd, f"devlink dev eswitch set pci/{cfg.pci} mode legacy")
    except Exception as exc:
        raise KsftSkipEx(f"Failed to enable switchdev mode on {cfg.pci}") from exc
    try:
        cmd(f"echo 1 > /sys/class/net/{cfg.ifname}/device/sriov_numvfs", shell=True)
        defer(cmd, f"echo 0 > /sys/class/net/{cfg.ifname}/device/sriov_numvfs", shell=True)
    except Exception as exc:
        raise KsftSkipEx(f"Failed to enable SR-IOV on {cfg.ifname}") from exc

    # Presumably gives the VF driver time to register its netdev.
    time.sleep(2)

    vf_net_dir = f"/sys/class/net/{cfg.ifname}/device/virtfn0/net"
    try:
        vf_names = os.listdir(vf_net_dir)
    except OSError as exc:
        # The directory is missing when the VF has no netdev; treat it like
        # an empty listing instead of letting the test error out.
        raise KsftSkipEx("VF interface not found") from exc
    if not vf_names:
        raise KsftSkipEx("VF interface not found")

    vf_ifc = vf_names[0]
    ip(f"link set dev {vf_ifc} up")
    if set_tc_mapping:
        cmd(f"tc qdisc add dev {vf_ifc} root handle 5 mqprio mode dcb hw 1 num_tc 8")

    return vf_ifc
132
133
def setup_vlans_on_vf(vf_ifc):
    """
    Creates the two test VLAN interfaces on top of the VF netdev.

    Each VLAN gets an address from its own /29 and an egress-qos-map entry
    "0:<tc>", so VLAN 101 is tied to TC 3 and VLAN 102 to TC 4.
    """
    # (VLAN id, traffic class, local address)
    vlan_plan = (
        (101, 3, "198.51.100.1"),
        (102, 4, "198.51.100.9"),
    )

    for vlan_id, tc, addr in vlan_plan:
        vlan_dev = f"{vf_ifc}.{vlan_id}"
        ip(f"link add link {vf_ifc} name {vlan_dev} type vlan id {vlan_id}")
        ip(f"addr add {addr}/29 dev {vlan_dev}")
        ip(f"link set dev {vlan_dev} up")
        ip(f"link set dev {vlan_dev} type vlan egress-qos-map 0:{tc}")
        ksft_pr(f"Created VLAN {vlan_dev} on {vf_ifc} with tc {tc} and IP {addr}")
150
151
def get_vf_info(cfg):
    """
    Looks up the representor netdev and devlink port index of VF 0.

    Parses `devlink -j port show` and stores the first port that belongs to
    cfg.pci and reports "vfnum" 0 in cfg.vf_representor / cfg.vf_port_index.
    Both attributes are left as None when no such port exists.
    """
    cfg.vf_representor = None
    cfg.vf_port_index = None

    raw_json = subprocess.check_output(["devlink", "-j", "port", "show"], encoding="utf-8")
    port_table = json.loads(raw_json)["port"]
    pci_prefix = f"pci/{cfg.pci}/"

    for handle, attrs in port_table.items():
        if not handle.startswith(pci_prefix):
            continue
        if attrs.get("vfnum") != 0:
            continue

        cfg.vf_representor = attrs.get("netdev")
        # The port handle looks like "pci/<addr>/<index>"; keep the index.
        cfg.vf_port_index = int(handle.split("/")[-1])
        break
170
171
def setup_bridge(cfg):
    """
    Builds a Linux bridge joining the uplink (cfg.ifname) and the VF
    representor (cfg.vf_representor), then brings the bridge up.

    Bridge removal is registered with defer(). Raises KsftSkipEx when no
    representor was found for the VF.
    """
    br_dev = f"br_{os.getpid()}"
    ip(f"link add name {br_dev} type bridge")
    defer(cmd, f"ip link del name {br_dev} type bridge")

    ip(f"link set dev {cfg.ifname} master {br_dev}")

    rep_dev = cfg.vf_representor
    if not rep_dev:
        raise KsftSkipEx("Could not find representor for the VF")

    ip(f"link set dev {rep_dev} master {br_dev}")
    ip(f"link set dev {rep_dev} up")
    ksft_pr(f"Set representor {rep_dev} up and added to bridge")

    ip(f"link set dev {br_dev} up")
192
193
def setup_devlink_rate(cfg):
    """
    Applies the devlink rate settings to the VF port: a tx_max limit plus
    per-TC bandwidth shares of 20 for TC3, 80 for TC4 and 0 for the rest.

    Raises KsftSkipEx when the VF port index is unknown or the device
    reports EOPNOTSUPP, and KsftFailEx for any other netlink error.
    """
    port_index = cfg.vf_port_index
    if port_index is None:
        raise KsftSkipEx("Could not find VF port index")

    nonzero_shares = {3: 20, 4: 80}
    tc_bws = [{"index": tc, "bw": nonzero_shares.get(tc, 0)} for tc in range(8)]
    rate_attrs = {
        "bus-name": "pci",
        "dev-name": cfg.pci,
        "port-index": port_index,
        # Presumably bytes/sec, i.e. the 1Gbps total from the module docstring.
        "rate-tx-max": 125000000,
        "rate-tc-bws": tc_bws,
    }

    try:
        cfg.devnl.rate_set(rate_attrs)
    except NlError as exc:
        if exc.error == 95:  # EOPNOTSUPP
            raise KsftSkipEx("devlink rate configuration is not supported on the VF") from exc
        raise KsftFailEx(f"rate_set failed on VF port {port_index}") from exc
222
223
def setup_remote_vlans(cfg):
    """
    Sets up VLAN interfaces on the remote side.

    Creates VLAN 101 and 102 on cfg.remote_ifname (on host cfg.remote),
    assigns the peer address of each /29 and brings the devices up.
    Deletion of each VLAN device is registered with defer().
    """
    remote_dev = cfg.remote_ifname
    vlan_ids = [101, 102]
    remote_ips = ["198.51.100.2", "198.51.100.10"]

    for vlan_id, ip_addr in zip(vlan_ids, remote_ips):
        vlan_dev = f"{remote_dev}.{vlan_id}"
        cmd(f"ip link add link {remote_dev} name {vlan_dev} "
            f"type vlan id {vlan_id}", host=cfg.remote)
        # Register cleanup as soon as the device exists, so a failure in
        # the following commands does not leave it behind on the remote.
        defer(cmd, f"ip link del {vlan_dev}", host=cfg.remote)
        cmd(f"ip addr add {ip_addr}/29 dev {vlan_dev}", host=cfg.remote)
        cmd(f"ip link set dev {vlan_dev} up", host=cfg.remote)
239
240
def setup_test_environment(cfg, set_tc_mapping=True):
    """
    Prepares everything a bandwidth test needs: the VF (optionally with TC
    mapping), its VLANs, the bridge, the devlink rate settings and the
    VLANs on the remote host.
    """
    vf_netdev = setup_vf(cfg, set_tc_mapping)
    ksft_pr(f"Created VF interface: {vf_netdev}")
    setup_vlans_on_vf(vf_netdev)

    # The lookup fills cfg.vf_representor / cfg.vf_port_index, which the
    # bridge and rate steps below read.
    get_vf_info(cfg)
    for step in (setup_bridge, setup_devlink_rate, setup_remote_vlans):
        step(cfg)
256
257
def measure_bandwidth(cfg, server_ip, client_ip, barrier):
    """
    Synchronizes with peers and runs an iperf3-based bandwidth measurement
    between the given endpoints. Returns average Gbps.

    `barrier` is a threading.Barrier shared with the other measurement
    threads so that all of them start together.

    Raises KsftFailEx when the barrier is not released within 10 seconds
    (or is broken) and when the iperf3 measurement itself fails.
    """
    runner = Iperf3Runner(cfg, server_ip=server_ip, client_ip=client_ip)
    try:
        barrier.wait(timeout=10)
    except threading.BrokenBarrierError as exc:
        raise KsftFailEx("iperf3 barrier wait timed out or barrier was broken") from exc

    try:
        bw_gbps = runner.measure_bandwidth(reverse=True)
    except Exception as exc:
        raise KsftFailEx("iperf3 bandwidth measurement failed") from exc

    return bw_gbps
275
276
def run_bandwidth_test(cfg):
    """
    Runs parallel bandwidth measurements for each VLAN/TC pair and collects results.

    Returns a dict mapping the TC index (3 and 4) to the value returned by
    measure_bandwidth() for that TC.

    An exception raised in a worker thread is not propagated by join(), so
    workers record their failure and the first recorded one is re-raised
    here. Raises KsftFailEx when a TC has no result.
    """
    vf_vlan_data = [
        # (local_ip, remote_ip, TC)
        ("198.51.100.1",  "198.51.100.2", 3),
        ("198.51.100.9", "198.51.100.10", 4),
    ]

    results = {}
    errors = []  # (tc_ix, exception), in the order the workers failed
    start_barrier = threading.Barrier(len(vf_vlan_data))

    def _run_measure_bandwidth_thread(local_ip, remote_ip, tc_ix):
        try:
            results[tc_ix] = measure_bandwidth(cfg, local_ip, remote_ip, start_barrier)
        except Exception as exc:
            errors.append((tc_ix, exc))
            # Release a peer still waiting at the barrier instead of
            # letting it sit there until its timeout expires.
            start_barrier.abort()

    threads = [
        threading.Thread(target=_run_measure_bandwidth_thread,
                         args=(local_ip, remote_ip, tc_ix))
        for local_ip, remote_ip, tc_ix in vf_vlan_data
    ]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

    if errors:
        # The earliest failure is the most likely root cause; later ones
        # may only be the consequence of the aborted barrier.
        tc_ix, exc = errors[0]
        ksft_pr(f"Bandwidth measurement for TC {tc_ix} failed: {exc}")
        raise exc

    for _, _, tc_ix in vf_vlan_data:
        if results.get(tc_ix) is None:
            raise KsftFailEx("iperf3 failed; cannot evaluate bandwidth")

    return results
310
311
def calculate_bandwidth_percentages(results):
    """
    Calculates the percentage of total bandwidth received by TC3 and TC4.

    `results` maps TC index to measured bandwidth and must contain the
    keys 3 and 4. Returns a dict with 'tc3_bw', 'tc4_bw', 'total_bw' and
    the two shares 'tc3_percentage' / 'tc4_percentage'.

    Raises KsftFailEx when a TC result is missing or when no traffic was
    measured at all (the shares would be undefined).
    """
    if 3 not in results or 4 not in results:
        raise KsftFailEx(f"Missing expected TC results in {results}")

    tc3_bw = results[3]
    tc4_bw = results[4]
    total_bw = tc3_bw + tc4_bw
    if total_bw <= 0:
        # Avoid a ZeroDivisionError and report a proper test failure.
        raise KsftFailEx(
            f"No bandwidth measured (total {total_bw}); cannot compute TC shares"
        )

    tc3_percentage = (tc3_bw / total_bw) * 100
    tc4_percentage = (tc4_bw / total_bw) * 100

    return {
        'tc3_bw': tc3_bw,
        'tc4_bw': tc4_bw,
        'tc3_percentage': tc3_percentage,
        'tc4_percentage': tc4_percentage,
        'total_bw': total_bw
    }
332
333
def print_bandwidth_results(bw_data, test_name):
    """
    Logs the per-TC bandwidth, the total and the per-TC percentage share
    for the test identified by `test_name`.
    """
    # (line template, bw_data key); formatted one at a time while printing.
    report_rows = (
        ("TC 3: {:.2f} Gbits/sec", 'tc3_bw'),
        ("TC 4: {:.2f} Gbits/sec", 'tc4_bw'),
        ("Total bandwidth: {:.2f} Gbits/sec", 'total_bw'),
        ("TC 3 percentage: {:.1f}%", 'tc3_percentage'),
        ("TC 4 percentage: {:.1f}%", 'tc4_percentage'),
    )

    ksft_pr(f"Bandwidth check results {test_name}:")
    for template, key in report_rows:
        ksft_pr(template.format(bw_data[key]))
344
345
def verify_total_bandwidth(bw_data, validator):
    """
    Checks that bw_data['total_bw'] lies inside the validator's "total"
    window.

    Returns None when it does. Raises KsftSkipEx when the total is below
    the window (tx_max was not reached, so shares cannot be judged) and
    KsftFailEx otherwise, i.e. when the total exceeds the ceiling.
    """
    measured = bw_data['total_bw']

    within_window = validator.bound({"total": measured})
    if not within_window:
        floor, ceiling = validator.bounds["total"]
        target = validator.expected_total

        if measured < floor:
            raise KsftSkipEx(
                f"Total bandwidth {measured:.2f} Gbps < minimum "
                f"{floor:.2f} Gbps; "
                f"parent tx_max ({target:.1f} G) "
                f"not reached, cannot validate share"
            )

        raise KsftFailEx(
            f"Total bandwidth {measured:.2f} Gbps exceeds allowed ceiling "
            f"{ceiling:.2f} Gbps "
            f"(VF tx_max set to {target:.1f} G)"
        )
370
371
def run_bandwidth_distribution_test(cfg, set_tc_mapping):
    """
    Sets up the environment, measures both TCs in parallel, logs the
    outcome and validates the total bandwidth.

    Returns True when the measured TC3/TC4 percentages fall inside the
    windows of cfg.tc_bw_validator, False otherwise.
    """
    setup_test_environment(cfg, set_tc_mapping)
    bw_data = calculate_bandwidth_percentages(run_bandwidth_test(cfg))

    if set_tc_mapping:
        label = "with TC mapping"
    else:
        label = "without TC mapping"
    print_bandwidth_results(bw_data, label)

    verify_total_bandwidth(bw_data, cfg.traffic_bw_validator)

    measured_shares = {
        "tc3": bw_data['tc3_percentage'],
        "tc4": bw_data['tc4_percentage'],
    }
    return cfg.tc_bw_validator.bound(measured_shares)
386
387
def test_no_tc_mapping_bandwidth(cfg):
    """
    Checks that the 20/80 split is NOT observed when no TC mapping is set.

    On mlx5 the expectation is inverted: a matching split is reported as an
    expected failure (xfail), and a non-matching split is a failure because
    it means the driver behavior changed.
    """
    pass_bw_msg = "Bandwidth is NOT distributed as 20/80 without TC mapping"
    fail_bw_msg = "Bandwidth matched 20/80 split without TC mapping"
    driver_info = ethtool(f"-i {cfg.ifname}").stdout
    is_mlx5 = "driver: mlx5" in driver_info

    split_matched = run_bandwidth_distribution_test(cfg, set_tc_mapping=False)

    if split_matched:
        failure_type = KsftXfailEx if is_mlx5 else KsftFailEx
        raise failure_type(fail_bw_msg)

    if is_mlx5:
        raise KsftFailEx("mlx5 behavior changed:" + pass_bw_msg)

    ksft_pr(pass_bw_msg)
403
404
def test_tc_mapping_bandwidth(cfg):
    """
    Checks that, with TC mapping configured, the measured bandwidth is
    split 20/80 between TC3 and TC4; raises KsftFailEx otherwise.
    """
    split_matched = run_bandwidth_distribution_test(cfg, set_tc_mapping=True)
    if not split_matched:
        raise KsftFailEx("Bandwidth did not match 20/80 split with TC mapping")

    ksft_pr("Bandwidth is distributed as 20/80 with TC mapping")
414
415
def main() -> None:
    """
    Entry point: prepares the shared test configuration and runs both
    bandwidth test cases.
    """
    with NetDrvEpEnv(__file__, nsim_test=False) as cfg:
        cfg.devnl = DevlinkFamily()

        # The PCI address is the last component of the resolved sysfs
        # device link of the interface.
        device_path = os.path.realpath(f"/sys/class/net/{cfg.ifname}/device")
        cfg.pci = os.path.basename(device_path)
        if not cfg.pci:
            raise KsftSkipEx("Could not get PCI address of the interface")

        # Total is validated against 1 (Gbps), shares against 20/80 (%).
        cfg.traffic_bw_validator = BandwidthValidator({"total": 1})
        cfg.tc_bw_validator = BandwidthValidator({"tc3": 20, "tc4": 80})

        ksft_run(
            cases=[test_no_tc_mapping_bandwidth, test_tc_mapping_bandwidth],
            args=(cfg,),
        )
    ksft_exit()
436
437
# Run the test suite only when this file is executed as a script.
if __name__ == "__main__":
    main()
440