#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0

"""
Devlink Rate TC Bandwidth Test Suite
===================================

This test suite verifies the functionality of devlink-rate traffic class (TC)
bandwidth distribution in a virtualized environment. The tests validate that
bandwidth can be properly allocated between different traffic classes and
that TC mapping works as expected.

Test Environment:
----------------
- Creates 1 VF
- Establishes a bridge connecting the VF representor and the uplink representor
- Sets up 2 VLAN interfaces on the VF with different VLAN IDs (101, 102)
- Configures different traffic classes (TC3 and TC4) for each VLAN

Test Cases:
----------
1. test_no_tc_mapping_bandwidth:
   - Verifies that without TC mapping, bandwidth is NOT distributed according to
     the configured 20/80 split between TC3 and TC4
   - This test should fail if bandwidth matches the 20/80 split without TC
     mapping
   - Expected: Bandwidth should NOT be distributed as 20/80

2. test_tc_mapping_bandwidth:
   - Configures TC mapping using mqprio qdisc
   - Verifies that with TC mapping, bandwidth IS distributed according to the
     configured 20/80 split between TC3 and TC4
   - Expected: Bandwidth should be distributed as 20/80

Bandwidth Distribution:
----------------------
- TC3 (VLAN 101): Configured for 20% of total bandwidth
- TC4 (VLAN 102): Configured for 80% of total bandwidth
- Total bandwidth: 1Gbps
- Tolerance: +-12%

Hardware-Specific Behavior (mlx5):
--------------------------
mlx5 hardware enforces traffic class separation by ensuring that each transmit
queue (SQ) is associated with a single TC. If a packet is sent on a queue that
doesn't match the expected TC (based on DSCP or VLAN priority and hypervisor-set
mapping), the hardware moves the queue to the correct TC scheduler to preserve
traffic isolation.

This behavior means that even without explicit TC-to-queue mapping, bandwidth
enforcement may still appear to work—because the hardware dynamically adjusts
the scheduling context. However, this can lead to performance issues in high
rates and HOL blocking if traffic from different TCs is mixed on the same queue.
"""

import errno
import json
import os
import subprocess
import threading
import time

from lib.py import ksft_pr, ksft_run, ksft_exit
from lib.py import KsftSkipEx, KsftFailEx, KsftXfailEx
from lib.py import NetDrvEpEnv, DevlinkFamily
from lib.py import NlError
from lib.py import cmd, defer, ethtool, ip
from lib.py import Iperf3Runner


class BandwidthValidator:
    """
    Validates total bandwidth and individual shares with tolerance
    relative to the overall total.

    `shares` maps a name to its expected value. The allowed deviation for
    every entry is `tolerance_percent` percent of the SUM of all expected
    values (not of the individual value), so e.g. shares of 20 and 80 with
    the default 12% tolerance yield bounds of 8..32 and 68..92.
    """

    def __init__(self, shares, tolerance_percent=12):
        self.tolerance_percent = tolerance_percent
        # Must be set before the bounds are computed: min/max_expected use it.
        self.expected_total = sum(shares.values())
        self.bounds = {
            name: (self.min_expected(exp), self.max_expected(exp))
            for name, exp in shares.items()
        }

    def min_expected(self, value):
        """Calculates the minimum acceptable value based on tolerance."""
        return value - (self.expected_total * self.tolerance_percent / 100)

    def max_expected(self, value):
        """Calculates the maximum acceptable value based on tolerance."""
        return value + (self.expected_total * self.tolerance_percent / 100)

    def bound(self, values):
        """
        Return True if all given values fall within tolerance.

        `values` maps names (which must be keys of the shares passed to the
        constructor, otherwise KeyError is raised) to measured values.
        """
        for name, value in values.items():
            low, high = self.bounds[name]
            if not low <= value <= high:
                return False
        return True


def setup_vf(cfg, set_tc_mapping=True):
    """
    Sets up a VF on the given network interface.

    Enables SR-IOV and switchdev mode, brings the VF interface up,
    and optionally configures TC mapping using mqprio.

    Returns the VF netdev name. Raises KsftSkipEx if switchdev mode or
    SR-IOV cannot be enabled, or if no VF netdev shows up.
    """
    try:
        cmd(f"devlink dev eswitch set pci/{cfg.pci} mode switchdev")
        defer(cmd, f"devlink dev eswitch set pci/{cfg.pci} mode legacy")
    except Exception as exc:
        raise KsftSkipEx(f"Failed to enable switchdev mode on {cfg.pci}") from exc
    try:
        cmd(f"echo 1 > /sys/class/net/{cfg.ifname}/device/sriov_numvfs", shell=True)
        defer(cmd, f"echo 0 > /sys/class/net/{cfg.ifname}/device/sriov_numvfs", shell=True)
    except Exception as exc:
        raise KsftSkipEx(f"Failed to enable SR-IOV on {cfg.ifname}") from exc

    # Give the VF netdev time to appear before looking it up in sysfs.
    time.sleep(2)
    vf_net_dir = f"/sys/class/net/{cfg.ifname}/device/virtfn0/net"
    try:
        vf_ifcs = os.listdir(vf_net_dir)
    except OSError as exc:
        # A missing directory means the same as an empty one: no VF netdev.
        raise KsftSkipEx("VF interface not found") from exc
    if not vf_ifcs:
        raise KsftSkipEx("VF interface not found")

    vf_ifc = vf_ifcs[0]
    ip(f"link set dev {vf_ifc} up")
    if set_tc_mapping:
        cmd(f"tc qdisc add dev {vf_ifc} root handle 5 mqprio mode dcb hw 1 num_tc 8")

    return vf_ifc


def setup_vlans_on_vf(vf_ifc):
    """
    Sets up two VLAN interfaces on the given VF, each mapped to a different TC.
    """
    vlan_configs = [
        {"vlan_id": 101, "tc": 3, "ip": "198.51.100.1"},
        {"vlan_id": 102, "tc": 4, "ip": "198.51.100.9"},
    ]

    for config in vlan_configs:
        vlan_dev = f"{vf_ifc}.{config['vlan_id']}"
        ip(f"link add link {vf_ifc} name {vlan_dev} type vlan id {config['vlan_id']}")
        ip(f"addr add {config['ip']}/29 dev {vlan_dev}")
        ip(f"link set dev {vlan_dev} up")
        ip(f"link set dev {vlan_dev} type vlan egress-qos-map 0:{config['tc']}")
        ksft_pr(f"Created VLAN {vlan_dev} on {vf_ifc} with tc {config['tc']} and IP {config['ip']}")


def get_vf_info(cfg):
    """
    Finds the VF representor interface and devlink port index
    for the given PCI device used in the test environment.

    Stores the result in cfg.vf_representor and cfg.vf_port_index; both are
    left as None when no matching port is found.
    """
    cfg.vf_representor = None
    cfg.vf_port_index = None
    out = subprocess.check_output(["devlink", "-j", "port", "show"], encoding="utf-8")
    ports = json.loads(out)["port"]

    for port_name, props in ports.items():
        netdev = props.get("netdev")

        if (port_name.startswith(f"pci/{cfg.pci}/") and
                props.get("vfnum") == 0):
            cfg.vf_representor = netdev
            # Port names look like "pci/<address>/<index>".
            cfg.vf_port_index = int(port_name.split("/")[-1])
            break


def setup_bridge(cfg):
    """
    Creates and configures a Linux bridge, with both the uplink
    and VF representor interfaces attached to it.

    Raises KsftSkipEx if no VF representor was found by get_vf_info().
    """
    bridge_name = f"br_{os.getpid()}"
    ip(f"link add name {bridge_name} type bridge")
    defer(cmd, f"ip link del name {bridge_name} type bridge")

    ip(f"link set dev {cfg.ifname} master {bridge_name}")

    rep_name = cfg.vf_representor
    if rep_name:
        ip(f"link set dev {rep_name} master {bridge_name}")
        ip(f"link set dev {rep_name} up")
        ksft_pr(f"Set representor {rep_name} up and added to bridge")
    else:
        raise KsftSkipEx("Could not find representor for the VF")

    ip(f"link set dev {bridge_name} up")


def setup_devlink_rate(cfg):
    """
    Configures devlink rate tx_max and traffic class bandwidth for the VF.

    Raises KsftSkipEx if the VF port index is unknown or the device reports
    EOPNOTSUPP, and KsftFailEx on any other netlink error.
    """
    port_index = cfg.vf_port_index
    if port_index is None:
        raise KsftSkipEx("Could not find VF port index")

    # 20% for TC3, 80% for TC4, nothing for the remaining six classes.
    tc_bw = {3: 20, 4: 80}
    try:
        cfg.devnl.rate_set({
            "bus-name": "pci",
            "dev-name": cfg.pci,
            "port-index": port_index,
            # 125000000 * 8 = 1e9, i.e. the 1Gbps total from the module
            # docstring expressed in bytes per second.
            "rate-tx-max": 125000000,
            "rate-tc-bws": [
                {"index": tc_ix, "bw": tc_bw.get(tc_ix, 0)}
                for tc_ix in range(8)
            ]
        })
    except NlError as exc:
        if exc.error == errno.EOPNOTSUPP:
            raise KsftSkipEx("devlink rate configuration is not supported on the VF") from exc
        raise KsftFailEx(f"rate_set failed on VF port {port_index}") from exc


def setup_remote_vlans(cfg):
    """
    Sets up VLAN interfaces on the remote side.
    """
    remote_dev = cfg.remote_ifname
    vlan_ids = [101, 102]
    remote_ips = ["198.51.100.2", "198.51.100.10"]

    for vlan_id, ip_addr in zip(vlan_ids, remote_ips):
        vlan_dev = f"{remote_dev}.{vlan_id}"
        cmd(f"ip link add link {remote_dev} name {vlan_dev} "
            f"type vlan id {vlan_id}", host=cfg.remote)
        cmd(f"ip addr add {ip_addr}/29 dev {vlan_dev}", host=cfg.remote)
        cmd(f"ip link set dev {vlan_dev} up", host=cfg.remote)
        defer(cmd, f"ip link del {vlan_dev}", host=cfg.remote)


def setup_test_environment(cfg, set_tc_mapping=True):
    """
    Sets up the complete test environment including VF creation, VLANs,
    bridge configuration and devlink rate setup.
    """
    vf_ifc = setup_vf(cfg, set_tc_mapping)
    ksft_pr(f"Created VF interface: {vf_ifc}")

    setup_vlans_on_vf(vf_ifc)

    get_vf_info(cfg)
    setup_bridge(cfg)

    setup_devlink_rate(cfg)
    setup_remote_vlans(cfg)


def measure_bandwidth(cfg, server_ip, client_ip, barrier):
    """
    Synchronizes with peers and runs an iperf3-based bandwidth measurement
    between the given endpoints. Returns average Gbps.

    Raises KsftFailEx if the barrier is broken (timeout or abort by a peer)
    or if the iperf3 measurement itself fails.
    """
    runner = Iperf3Runner(cfg, server_ip=server_ip, client_ip=client_ip)
    try:
        barrier.wait(timeout=10)
    except threading.BrokenBarrierError as exc:
        raise KsftFailEx("iperf3 barrier wait timed out or was aborted") from exc

    try:
        bw_gbps = runner.measure_bandwidth(reverse=True)
    except Exception as exc:
        raise KsftFailEx("iperf3 bandwidth measurement failed") from exc

    return bw_gbps


def run_bandwidth_test(cfg):
    """
    Runs parallel bandwidth measurements for each VLAN/TC pair and collects results.

    Returns a dict mapping TC index to the measured bandwidth. An exception
    raised in a measurement thread is re-raised here, in the calling thread,
    so that skip/fail reasons are reported instead of being lost.
    """
    # Exceptions from worker threads, in the order they occurred. The first
    # one is the root cause; later ones are typically broken-barrier fallout.
    errors = []

    def _run_measure_bandwidth_thread(local_ip, remote_ip, results, barrier, tc_ix):
        try:
            results[tc_ix] = measure_bandwidth(cfg, local_ip, remote_ip, barrier)
        except Exception as exc:  # pylint: disable=broad-except
            errors.append(exc)
            # Release a peer still waiting on the barrier right away rather
            # than letting it run into its timeout.
            barrier.abort()

    vf_vlan_data = [
        # (local_ip, remote_ip, TC)
        ("198.51.100.1", "198.51.100.2", 3),
        ("198.51.100.9", "198.51.100.10", 4),
    ]

    results = {}
    threads = []
    start_barrier = threading.Barrier(len(vf_vlan_data))

    for local_ip, remote_ip, tc_ix in vf_vlan_data:
        thread = threading.Thread(
            target=_run_measure_bandwidth_thread,
            args=(local_ip, remote_ip, results, start_barrier, tc_ix)
        )
        thread.start()
        threads.append(thread)

    for thread in threads:
        thread.join()

    if errors:
        raise errors[0]

    for tc_bw in results.values():
        if tc_bw is None:
            raise KsftFailEx("iperf3 failed; cannot evaluate bandwidth")

    return results


def calculate_bandwidth_percentages(results):
    """
    Calculates the percentage of total bandwidth received by TC3 and TC4.

    Raises KsftFailEx if a TC result is missing or if no traffic was
    measured at all (the shares would be undefined).
    """
    if 3 not in results or 4 not in results:
        raise KsftFailEx(f"Missing expected TC results in {results}")

    tc3_bw = results[3]
    tc4_bw = results[4]
    total_bw = tc3_bw + tc4_bw
    if total_bw <= 0:
        raise KsftFailEx(
            f"No traffic measured on TC3/TC4 (total bandwidth {total_bw}); "
            f"cannot compute bandwidth shares"
        )
    tc3_percentage = (tc3_bw / total_bw) * 100
    tc4_percentage = (tc4_bw / total_bw) * 100

    return {
        'tc3_bw': tc3_bw,
        'tc4_bw': tc4_bw,
        'tc3_percentage': tc3_percentage,
        'tc4_percentage': tc4_percentage,
        'total_bw': total_bw
    }


def print_bandwidth_results(bw_data, test_name):
    """
    Prints bandwidth measurements and TC usage summary for a given test.
    """
    ksft_pr(f"Bandwidth check results {test_name}:")
    ksft_pr(f"TC 3: {bw_data['tc3_bw']:.2f} Gbits/sec")
    ksft_pr(f"TC 4: {bw_data['tc4_bw']:.2f} Gbits/sec")
    ksft_pr(f"Total bandwidth: {bw_data['total_bw']:.2f} Gbits/sec")
    ksft_pr(f"TC 3 percentage: {bw_data['tc3_percentage']:.1f}%")
    ksft_pr(f"TC 4 percentage: {bw_data['tc4_percentage']:.1f}%")


def verify_total_bandwidth(bw_data, validator):
    """
    Ensures the total measured bandwidth falls within the acceptable tolerance.

    Raises KsftSkipEx when the total is below the lower bound (the shares
    cannot be judged if the link is not saturated) and KsftFailEx when it
    exceeds the upper bound.
    """
    total = bw_data['total_bw']

    if validator.bound({"total": total}):
        return

    low, high = validator.bounds["total"]

    if total < low:
        raise KsftSkipEx(
            f"Total bandwidth {total:.2f} Gbps < minimum "
            f"{low:.2f} Gbps; "
            f"parent tx_max ({validator.expected_total:.1f} G) "
            f"not reached, cannot validate share"
        )

    raise KsftFailEx(
        f"Total bandwidth {total:.2f} Gbps exceeds allowed ceiling "
        f"{high:.2f} Gbps "
        f"(VF tx_max set to {validator.expected_total:.1f} G)"
    )


def run_bandwidth_distribution_test(cfg, set_tc_mapping):
    """
    Sets up the environment, measures both TCs in parallel and checks the
    total bandwidth.

    Returns True if the measured TC3/TC4 percentages are within tolerance
    of the configured 20/80 split, False otherwise.
    """
    setup_test_environment(cfg, set_tc_mapping)
    bandwidths = run_bandwidth_test(cfg)
    bw_data = calculate_bandwidth_percentages(bandwidths)
    test_name = "with TC mapping" if set_tc_mapping else "without TC mapping"
    print_bandwidth_results(bw_data, test_name)

    verify_total_bandwidth(bw_data, cfg.traffic_bw_validator)

    return cfg.tc_bw_validator.bound({"tc3": bw_data['tc3_percentage'],
                                      "tc4": bw_data['tc4_percentage']})


def test_no_tc_mapping_bandwidth(cfg):
    """
    Verifies that bandwidth is not split 20/80 without traffic class mapping.

    On mlx5 the expectation is inverted: a 20/80 split without mapping is an
    expected failure (see the module docstring), and the absence of the
    split is reported as a change in driver behavior.
    """
    pass_bw_msg = "Bandwidth is NOT distributed as 20/80 without TC mapping"
    fail_bw_msg = "Bandwidth matched 20/80 split without TC mapping"
    is_mlx5 = "driver: mlx5" in ethtool(f"-i {cfg.ifname}").stdout

    if run_bandwidth_distribution_test(cfg, set_tc_mapping=False):
        if is_mlx5:
            raise KsftXfailEx(fail_bw_msg)
        raise KsftFailEx(fail_bw_msg)
    if is_mlx5:
        raise KsftFailEx("mlx5 behavior changed: " + pass_bw_msg)
    ksft_pr(pass_bw_msg)


def test_tc_mapping_bandwidth(cfg):
    """
    Verifies that bandwidth is correctly split 20/80 between TC3 and TC4
    when traffic class mapping is set.
    """
    if run_bandwidth_distribution_test(cfg, set_tc_mapping=True):
        ksft_pr("Bandwidth is distributed as 20/80 with TC mapping")
    else:
        raise KsftFailEx("Bandwidth did not match 20/80 split with TC mapping")


def main() -> None:
    """
    Main entry point for running the test cases.
    """
    with NetDrvEpEnv(__file__, nsim_test=False) as cfg:
        cfg.devnl = DevlinkFamily()

        cfg.pci = os.path.basename(
            os.path.realpath(f"/sys/class/net/{cfg.ifname}/device")
        )
        if not cfg.pci:
            raise KsftSkipEx("Could not get PCI address of the interface")

        # Total throughput is validated in Gbps, the TC shares in percent.
        cfg.traffic_bw_validator = BandwidthValidator({"total": 1})
        cfg.tc_bw_validator = BandwidthValidator({"tc3": 20, "tc4": 80})

        cases = [test_no_tc_mapping_bandwidth, test_tc_mapping_bandwidth]

        ksft_run(cases=cases, args=(cfg,))
    ksft_exit()


if __name__ == "__main__":
    main()