#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# Test local ECMP path re-selection on TCP retransmission timeout and PLB.
#
# Two namespaces connected by two parallel veth pairs with a 2-way ECMP
# route.  When a TCP path is blocked (via tc drop) or congested (via
# netem ECN marking), the kernel rehashes the connection via
# sk_rethink_txhash() + __sk_dst_reset(), causing the next route lookup
# to select the other ECMP path.
#
# Expected runtime: ~60 seconds.  Most time is spent waiting for TCP
# retransmission timeouts (1-7s per test) and running multi-round
# consistency checks (10 rounds each).  The large slowwait/connect-timeout
# values (30-120s) are worst-case bounds for CI; a correctly functioning
# kernel reaches each check well before the timeout expires.

source lib.sh

SUBNETS=(a b)
PORT=9900
: "${ECMP_REBUILD_ROUNDS:=10}"

# alloc_ports NAME [COUNT]
#
# Store the next unused port number in the variable called NAME, then
# advance the shared NEXT_PORT counter by COUNT (1 when omitted).  Tests
# call this at the point where they run, so a retry or a test added later
# gets ports of its own; the per-round tests reserve ECMP_REBUILD_ROUNDS
# ports in one call.
NEXT_PORT=$PORT
alloc_ports()
{
	# Deliberately free of locals: a local here could shadow the
	# caller's variable named by $1 and swallow the assignment.
	printf -v "$1" '%d' "$NEXT_PORT"
	: "$((NEXT_PORT += ${2:-1}))"
}

ALL_TESTS="
	test_ecmp_syn_rehash
	test_ecmp_synack_rehash
	test_ecmp_midstream_rehash
	test_ecmp_midstream_ack_rehash
	test_ecmp_plb_rehash
	test_ecmp_hash_policy1_no_rehash
	test_ecmp_no_flowlabel_leak
	test_ecmp_dst_rebuild_consistency
	test_ecmp_syncookie_path_consistency
"

# link_tx_packets_get NS DEV
#
# Print the tx_packets statistic of DEV as seen from inside namespace NS.
link_tx_packets_get()
{
	local ns=$1 dev=$2
	local counter="/sys/class/net/$dev/statistics/tx_packets"

	ip netns exec "$ns" cat "$counter"
}

# Return the number of packets matched by the tc filter action on a device.
# When tc drops packets via "action drop", the device's tx_packets is not
# incremented (packet never reaches veth_xmit), but the tc action maintains
# its own counter.
#
# Arguments: $1 - namespace, $2 - device.
# Prints the packet count of the first "Sent ... pkt" statistics line under
# parent 1:, or nothing when no such line exists (callers default to 0).
tc_filter_pkt_count()
{
	local ns=$1; shift
	local dev=$1; shift

	ip netns exec "$ns" tc -s filter show dev "$dev" parent 1: 2>/dev/null |
		awk '/Sent .* pkt/ {
			for (i=1; i<=NF; i++)
				if ($i == "pkt") { print $(i-1); exit }
		}'
}

# Read a TcpExt counter from /proc/net/netstat in a namespace.
# Returns 0 if the counter is not found.
#
# Arguments: $1 - namespace, $2 - counter name (e.g. TcpTimeoutRehash).
get_netstat_counter()
{
	local ns=$1; shift
	local field=$1; shift
	local val

	# The first "TcpExt:" line carries the field names, the second the
	# values; look the key up in the former and print the matching
	# column of the latter.
	# shellcheck disable=SC2016
	val=$(ip netns exec "$ns" awk -v key="$field" '
		/^TcpExt:/ {
			if (!h) { split($0, n); h=1 }
			else {
				split($0, v)
				for (i in n)
					if (n[i] == key) print v[i]
			}
		}
	' /proc/net/netstat)
	echo "${val:-0}"
}

# Apply netem ECN marking: CE-mark all ECT packets instead of dropping them.
#
# Arguments: $1 - namespace, $2 - device.  Installs a root qdisc, so it is
# undone by unblock_tcp.
mark_ecn()
{
	local ns=$1; shift
	local dev=$1; shift

	ip netns exec "$ns" tc qdisc add dev "$dev" root netem loss 100% ecn
}

# Block TCP (IPv6 next-header = 6) egress, allowing ICMPv6 through.
#
# Arguments: $1 - namespace, $2 - device.
block_tcp()
{
	local ns=$1; shift
	local dev=$1; shift

	ip netns exec "$ns" tc qdisc add dev "$dev" root handle 1: prio
	ip netns exec "$ns" tc filter add dev "$dev" parent 1: \
		protocol ipv6 prio 1 u32 match u8 0x06 0xff at 6 action drop
}

# Remove the root qdisc (and with it any filters) from a device.  Errors
# are silenced because tests call this both directly and again via defer,
# when the qdisc may already be gone.
#
# Arguments: $1 - namespace, $2 - device.
unblock_tcp()
{
	local ns=$1; shift
	local dev=$1; shift

	ip netns exec "$ns" tc qdisc del dev "$dev" root 2>/dev/null
}

# Return success when a device's TX counter exceeds a baseline value.
#
# Arguments: $1 - namespace, $2 - device, $3 - baseline packet count.
# An unreadable counter (empty output) is treated as 0 so that the
# comparison stays a plain "not yet" instead of making "[" fail with
# "integer expression expected" on every poll iteration.
dev_tx_packets_above()
{
	local ns=$1; shift
	local dev=$1; shift
	local baseline=$1; shift

	local cur
	cur=$(link_tx_packets_get "$ns" "$dev")
	[ "${cur:-0}" -gt "$baseline" ]
}

# Return success when both devices have dropped at least one TCP packet.
# both_devs_attempted NS DEV0 DEV1
#
# Succeed only when the tc filter counters of both devices report at
# least one matched packet.  A device without a readable counter counts
# as zero.
both_devs_attempted()
{
	local ns=$1 dev0=$2 dev1=$3
	local dev
	local -a hits=()

	# Sample both counters up front, then evaluate them.
	for dev in "$dev0" "$dev1"; do
		hits+=("$(tc_filter_pkt_count "$ns" "$dev")")
	done

	[ "${hits[0]:-0}" -ge 1 ] || return
	[ "${hits[1]:-0}" -ge 1 ]
}

# link_tx_packets_total NS [DEV0 [DEV1]]
#
# Print the sum of the tx_packets counters of two devices in NS.  The
# devices default to veth0a and veth1a.
link_tx_packets_total()
{
	local ns=$1
	local first=${2:-veth0a}
	local second=${3:-veth1a}
	local tx_first tx_second

	tx_first=$(link_tx_packets_get "$ns" "$first")
	tx_second=$(link_tx_packets_get "$ns" "$second")

	# shellcheck disable=SC2004
	echo $(( $tx_first + $tx_second ))
}

# (Re)install the ECMP multipath routes between NS1 and NS2.  $1 is the
# ip route operation ("add" to create, "change" to replace).  If $2 is
# given it names a congestion control to pin on both routes via "congctl";
# because dctcp carries TCP_CONG_NEEDS_ECN, this also tags the route with
# DST_FEATURE_ECN_CA, which makes the server negotiate ECN without the
# listener itself having to run dctcp.  The nexthop topology lives here
# only, so a test can re-pin the routes and restore them with one call.
install_ecmp_routes()
{
	local op=$1 cc=$2
	local -a cc_attr=()

	# Only emit "congctl <cc>" when a congestion control was requested;
	# an empty array expands to nothing in the route commands below.
	[ -n "$cc" ] && cc_attr=(congctl "$cc")

	ip -n "$NS1" -6 route "$op" fd00:ff::2/128 "${cc_attr[@]}" \
		nexthop via fd00:a::2 dev veth0a \
		nexthop via fd00:b::2 dev veth1a

	ip -n "$NS2" -6 route "$op" fd00:ff::1/128 "${cc_attr[@]}" \
		nexthop via fd00:a::1 dev veth0b \
		nexthop via fd00:b::1 dev veth1b
}

# Build the test topology: namespaces NS1/NS2, two veth pairs
# (veth0a<->veth0b on fd00:a::/64, veth1a<->veth1b on fd00:b::/64),
# loopback addresses fd00:ff::1 / fd00:ff::2 and the ECMP routes between
# them.  Returns $ksft_skip when tcp_syn_linear_timeouts cannot be set or
# when the final connectivity ping fails.
setup()
{
	setup_ns NS1 NS2

	local ns
	for ns in "$NS1" "$NS2"; do
		ip netns exec "$ns" sysctl -qw net.ipv6.conf.all.accept_dad=0
		ip netns exec "$ns" sysctl -qw net.ipv6.conf.default.accept_dad=0
		ip netns exec "$ns" sysctl -qw net.ipv6.conf.all.forwarding=1
		ip netns exec "$ns" sysctl -qw net.core.txrehash=1
	done

	local i sub
	for i in 0 1; do
		sub=${SUBNETS[$i]}
		# Create each end directly in its namespace.  Creating the
		# pair in the caller's namespace and moving it afterwards
		# can clash with an existing device of the same name there
		# and leaves a stray pair behind if setup is interrupted.
		ip link add "veth${i}a" netns "$NS1" type veth \
			peer name "veth${i}b" netns "$NS2"
		ip -n "$NS1" addr add "fd00:${sub}::1/64" dev "veth${i}a"
		ip -n "$NS2" addr add "fd00:${sub}::2/64" dev "veth${i}b"
		ip -n "$NS1" link set "veth${i}a" up
		ip -n "$NS2" link set "veth${i}b" up
	done

	ip -n "$NS1" addr add fd00:ff::1/128 dev lo
	ip -n "$NS2" addr add fd00:ff::2/128 dev lo

	# Allow many SYN retries at 1-second intervals (linear, no
	# exponential backoff) so the rehash test has enough attempts
	# to exercise both ECMP paths.
	if ! ip netns exec "$NS1" sysctl -qw \
		net.ipv4.tcp_syn_linear_timeouts=25; then
		echo "SKIP: tcp_syn_linear_timeouts not supported"
		return "$ksft_skip"
	fi
	ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_syn_retries=25

	# Keep the server's request socket alive during the blocking
	# period so SYN/ACK retransmits continue.
	ip netns exec "$NS2" sysctl -qw net.ipv4.tcp_synack_retries=25

	install_ecmp_routes add

	# Ping across each link in both directions before the tests run;
	# the results are ignored.  Presumably this warms up neighbour
	# entries - the script does not say.
	for i in 0 1; do
		sub=${SUBNETS[$i]}
		ip netns exec "$NS1" \
			ping -6 -c1 -W5 "fd00:${sub}::2" &>/dev/null
		ip netns exec "$NS2" \
			ping -6 -c1 -W5 "fd00:${sub}::1" &>/dev/null
	done

	if ! ip netns exec "$NS1" ping -6 -c1 -W5 fd00:ff::2 &>/dev/null; then
		echo "Basic connectivity check failed"
		return "$ksft_skip"
	fi
}

# Block ALL paths, start a connection, wait until SYNs have been dropped
# on both interfaces (proving rehash steered the SYN to a new path), then
# unblock so the connection completes.
test_ecmp_syn_rehash()
{
	RET=0
	local port
	alloc_ports port

	block_tcp "$NS1" veth0a
	defer unblock_tcp "$NS1" veth0a
	block_tcp "$NS1" veth1a
	defer unblock_tcp "$NS1" veth1a

	ip netns exec "$NS2" socat \
		"TCP6-LISTEN:$port,bind=[fd00:ff::2],reuseaddr,fork" \
		EXEC:"echo ESTABLISH_OK" &
	defer kill_process $!

	wait_local_port_listen "$NS2" "$port" tcp

	local rehash_before
	rehash_before=$(get_netstat_counter "$NS1" TcpTimeoutRehash)

	# Start the connection in the background; it will retry SYNs at
	# 1-second intervals until an unblocked path is found.
	# Use -u (unidirectional) to only receive from the server;
	# sending data back would risk SIGPIPE if the server's EXEC
	# child has already exited.
	local tmpfile
	tmpfile=$(mktemp)
	defer rm -f "$tmpfile"

	ip netns exec "$NS1" socat -u \
		"TCP6:[fd00:ff::2]:$port,bind=[fd00:ff::1],connect-timeout=60" \
		STDOUT >"$tmpfile" 2>&1 &
	local client_pid=$!
	defer kill_process "$client_pid"

	# Wait until both paths have seen at least one dropped SYN.
	# This proves sk_rethink_txhash() rehashed the connection from
	# one ECMP path to the other.
	slowwait 30 both_devs_attempted "$NS1" veth0a veth1a > /dev/null
	check_err $? "SYNs did not appear on both paths (rehash not working)"
	if [ "$RET" -ne 0 ]; then
		log_test "Local ECMP SYN rehash: establish with blocked paths"
		return
	fi

	# Unblock both paths and let the next SYN retransmit succeed.
	unblock_tcp "$NS1" veth0a
	unblock_tcp "$NS1" veth1a

	# The client's exit status is only used in the failure message.
	local rc=0
	wait "$client_pid" || rc=$?

	local result
	result=$(cat "$tmpfile" 2>/dev/null)

	if [[ "$result" != *"ESTABLISH_OK"* ]]; then
		check_err 1 "connection failed after unblocking (rc=$rc): $result"
	fi

	local rehash_after
	rehash_after=$(get_netstat_counter "$NS1" TcpTimeoutRehash)
	if [ "$rehash_after" -le "$rehash_before" ]; then
		check_err 1 "TcpTimeoutRehash counter did not increment"
	fi

	log_test "Local ECMP SYN rehash: establish with blocked paths"
}

# Block the server's return paths so SYN/ACKs are dropped.  The client
# retransmits SYNs at 1-second intervals; each duplicate SYN arriving at
# the server triggers tcp_rtx_synack() which re-rolls txhash, so the
# retransmitted SYN/ACK selects a different ECMP return path.
test_ecmp_synack_rehash()
{
	RET=0
	local port
	alloc_ports port

	block_tcp "$NS2" veth0b
	defer unblock_tcp "$NS2" veth0b
	block_tcp "$NS2" veth1b
	defer unblock_tcp "$NS2" veth1b

	ip netns exec "$NS2" socat \
		"TCP6-LISTEN:$port,bind=[fd00:ff::2],reuseaddr,fork" \
		EXEC:"echo SYNACK_OK" &
	defer kill_process $!

	wait_local_port_listen "$NS2" "$port" tcp

	# Start the connection; SYNs reach the server (client egress is
	# open) but SYN/ACKs are dropped on the server's return path.
	local tmpfile
	tmpfile=$(mktemp)
	defer rm -f "$tmpfile"

	ip netns exec "$NS1" socat -u \
		"TCP6:[fd00:ff::2]:$port,bind=[fd00:ff::1],connect-timeout=60" \
		STDOUT >"$tmpfile" 2>&1 &
	local client_pid=$!
	defer kill_process "$client_pid"

	# Wait until both server-side interfaces have dropped at least
	# one SYN/ACK, proving the server rehashed its return path.
	slowwait 30 both_devs_attempted "$NS2" veth0b veth1b > /dev/null
	check_err $? "SYN/ACKs did not appear on both return paths"
	if [ "$RET" -ne 0 ]; then
		log_test "Local ECMP SYN/ACK rehash: blocked return path"
		return
	fi

	# Unblock and let the connection complete.
	unblock_tcp "$NS2" veth0b
	unblock_tcp "$NS2" veth1b

	# The client's exit status is only used in the failure message.
	local rc=0
	wait "$client_pid" || rc=$?

	local result
	result=$(cat "$tmpfile" 2>/dev/null)

	if [[ "$result" != *"SYNACK_OK"* ]]; then
		check_err 1 "connection failed after unblocking (rc=$rc): $result"
	fi

	log_test "Local ECMP SYN/ACK rehash: blocked return path"
}

# Establish a data transfer with both paths open, then block the
# active path.  Verify that data appears on the previously inactive
# path (proving RTO triggered a rehash) and that TcpTimeoutRehash
# incremented.
#
# With 2-way ECMP each rehash may pick the same path, so a single
# attempt can occasionally fail.  Retry once for robustness.

# Single attempt at the midstream rehash check.  Returns 0 on success.
# On failure the reason is printed on stdout and 1 is returned.  $1 is
# the port to use.  Cleanup is done inline (not via defer).
ecmp_midstream_rehash_attempt()
{
	local port=$1; shift
	local reason=""

	ip netns exec "$NS2" socat -u \
		"TCP6-LISTEN:$port,bind=[fd00:ff::2],reuseaddr" - >/dev/null &
	local server_pid=$!

	wait_local_port_listen "$NS2" "$port" tcp

	local base_tx0 base_tx1
	base_tx0=$(link_tx_packets_get "$NS1" veth0a)
	base_tx1=$(link_tx_packets_get "$NS1" veth1a)

	# Continuous data source; timeout caps overall test duration and
	# must exceed the slowwait below so data keeps flowing.
	ip netns exec "$NS1" timeout 90 socat -u \
		OPEN:/dev/zero \
		"TCP6:[fd00:ff::2]:$port,bind=[fd00:ff::1]" &>/dev/null &
	local client_pid=$!

	# Wait for enough packets to identify the active path.
	if ! busywait "$BUSYWAIT_TIMEOUT" until_counter_is \
		">= $((base_tx0 + base_tx1 + 10))" \
		link_tx_packets_total "$NS1" > /dev/null; then
		kill "$client_pid" "$server_pid" 2>/dev/null
		wait "$client_pid" "$server_pid" 2>/dev/null
		echo "no TX activity"
		return 1
	fi

	# Find the active path and block it.
	local current_tx0 current_tx1 active_idx inactive_idx
	current_tx0=$(link_tx_packets_get "$NS1" veth0a)
	current_tx1=$(link_tx_packets_get "$NS1" veth1a)
	if [ $((current_tx0 - base_tx0)) -ge $((current_tx1 - base_tx1)) ]; then
		active_idx=0; inactive_idx=1
	else
		active_idx=1; inactive_idx=0
	fi

	local rehash_before
	rehash_before=$(get_netstat_counter "$NS1" TcpTimeoutRehash)
	# Suppress __dst_negative_advice() in tcp_write_timeout() so
	# that __sk_dst_reset() is the only dst-invalidation mechanism
	# on the RTO path.
	local saved_retries1
	saved_retries1=$(ip netns exec "$NS1" sysctl -n net.ipv4.tcp_retries1)
	ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_retries1=255

	block_tcp "$NS1" "veth${active_idx}a"

	# Capture baseline after block_tcp returns.  block_tcp adds a
	# prio qdisc then a tc filter; between those two steps the
	# qdisc's CAN_BYPASS fast-path lets packets through unfiltered.
	local inactive_before
	inactive_before=$(link_tx_packets_get "$NS1" "veth${inactive_idx}a")

	# Wait for meaningful data on the previously inactive path,
	# proving RTO triggered a rehash and data actually moved.
	if ! slowwait 60 dev_tx_packets_above \
		"$NS1" "veth${inactive_idx}a" "$((inactive_before + 100))" \
		> /dev/null; then
		reason="no data on alternate path"
	fi

	local rehash_after
	rehash_after=$(get_netstat_counter "$NS1" TcpTimeoutRehash)
	if [ "$rehash_after" -le "$rehash_before" ]; then
		reason="${reason:+$reason; }TcpTimeoutRehash did not increment"
	fi

	unblock_tcp "$NS1" "veth${active_idx}a"
	ip netns exec "$NS1" sysctl -qw \
		net.ipv4.tcp_retries1="$saved_retries1"
	kill "$client_pid" "$server_pid" 2>/dev/null
	wait "$client_pid" "$server_pid" 2>/dev/null
	if [ -n "$reason" ]; then
		echo "$reason"
		return 1
	fi
	return 0
}

# Run the midstream rehash attempt, retrying once on a separate port.
# The first attempt's output is discarded; only a failing retry is
# recorded, with its printed reason as the error message.
test_ecmp_midstream_rehash()
{
	RET=0
	local port retry_port
	alloc_ports port
	alloc_ports retry_port

	local fail_reason
	if ! ecmp_midstream_rehash_attempt "$port" >/dev/null; then
		fail_reason=$(ecmp_midstream_rehash_attempt "$retry_port")
		check_err $? "$fail_reason"
	fi

	log_test "Local ECMP midstream rehash: block active path"
}

# Single attempt at the ACK rehash check.  Returns 0 on success.
# On failure the reason is printed on stdout and 1 is returned.  $1 is
# the port to use.  Cleanup is done inline (not via defer).
ecmp_ack_rehash_attempt()
{
	local port=$1; shift
	local reason=""

	ip netns exec "$NS2" socat -u \
		"TCP6-LISTEN:$port,bind=[fd00:ff::2],reuseaddr" - >/dev/null &
	local server_pid=$!

	wait_local_port_listen "$NS2" "$port" tcp

	local base_tx0 base_tx1
	base_tx0=$(link_tx_packets_get "$NS2" veth0b)
	base_tx1=$(link_tx_packets_get "$NS2" veth1b)

	# Continuous data source from NS1 to NS2.  Cap the send buffer
	# so in-flight data stays below the receiver's advertised window.
	# Without this, the sender can exhaust the receiver's window and
	# enter persist mode (zero-window probing) instead of RTO when
	# ACKs are blocked, and persist probes do not trigger flowlabel
	# rehash.
	ip netns exec "$NS1" timeout 120 socat -u \
		OPEN:/dev/zero \
		"TCP6:[fd00:ff::2]:$port,bind=[fd00:ff::1],sndbuf=16384" \
		&>/dev/null &
	local client_pid=$!

	# Wait for enough server TX (ACKs) to identify the active return path.
	if ! busywait "$BUSYWAIT_TIMEOUT" until_counter_is \
		">= $((base_tx0 + base_tx1 + 10))" \
		link_tx_packets_total "$NS2" veth0b veth1b > /dev/null; then
		kill "$client_pid" "$server_pid" 2>/dev/null
		wait "$client_pid" "$server_pid" 2>/dev/null
		echo "no server TX activity"
		return 1
	fi

	# The device with the larger TX delta carries the ACKs.
	local cur_tx0 cur_tx1 active_dev inactive_dev
	cur_tx0=$(link_tx_packets_get "$NS2" veth0b)
	cur_tx1=$(link_tx_packets_get "$NS2" veth1b)
	if [ $((cur_tx0 - base_tx0)) -ge $((cur_tx1 - base_tx1)) ]; then
		active_dev=veth0b; inactive_dev=veth1b
	else
		active_dev=veth1b; inactive_dev=veth0b
	fi

	local rehash_before
	rehash_before=$(get_netstat_counter "$NS2" TcpDuplicateDataRehash)

	# Block the inactive return path first (no effect on current
	# ACK flow), then block the active path.  This avoids counting
	# normal ACK drops as rehash evidence.
	block_tcp "$NS2" "$inactive_dev"
	local inactive_before
	inactive_before=$(tc_filter_pkt_count "$NS2" "$inactive_dev")
	block_tcp "$NS2" "$active_dev"

	# NS1 will RTO (no ACKs), retransmit with new flowlabel.
	# NS2 detects the flowlabel change via tcp_rcv_spurious_retrans(),
	# rehashes, and NS2's ACKs try the previously inactive return
	# path.  One successful rehash is sufficient.
	if ! slowwait 60 until_counter_is \
		">= $((${inactive_before:-0} + 1))" \
		tc_filter_pkt_count "$NS2" "$inactive_dev" > /dev/null; then
		reason="no ACKs on alternate return path after blocking"
	fi

	local rehash_after
	rehash_after=$(get_netstat_counter "$NS2" TcpDuplicateDataRehash)
	if [ "$rehash_after" -le "$rehash_before" ]; then
		reason="${reason:+$reason; }TcpDuplicateDataRehash did not increment"
	fi

	unblock_tcp "$NS2" "$active_dev"
	unblock_tcp "$NS2" "$inactive_dev"
	kill "$client_pid" "$server_pid" 2>/dev/null
	wait "$client_pid" "$server_pid" 2>/dev/null
	if [ -n "$reason" ]; then
		echo "$reason"
		return 1
	fi
	return 0
}

# Block the receiver's (NS2) ACK return paths while data flows from
# NS1 to NS2.  The sender (NS1) times out and retransmits with a new
# flowlabel; the receiver detects the changed flowlabel via
# tcp_rcv_spurious_retrans() and rehashes its own txhash so that its
# ACKs try a different ECMP return path.
#
# With 2-way ECMP each rehash may pick the same path, so a single
# attempt can occasionally fail.  Retry once for robustness.
test_ecmp_midstream_ack_rehash()
{
	RET=0
	local port retry_port
	alloc_ports port
	alloc_ports retry_port

	# First attempt is silent; only a failing retry is recorded.
	local fail_reason
	if ! ecmp_ack_rehash_attempt "$port" >/dev/null; then
		fail_reason=$(ecmp_ack_rehash_attempt "$retry_port")
		check_err $? "$fail_reason"
	fi

	log_test "Local ECMP midstream ACK rehash: blocked return path"
}

# Establish a DCTCP data transfer with PLB enabled, then ECN-mark both
# paths.  Sustained CE marking triggers PLB to call sk_rethink_txhash()
# + __sk_dst_reset(), bouncing the connection between ECMP paths.
# Verify data appears on both paths and that TCPPLBRehash incremented.
# Skipped (returns $ksft_skip) when dctcp is not listed in
# tcp_available_congestion_control.
test_ecmp_plb_rehash()
{
	RET=0
	local port
	alloc_ports port

	# PLB needs DCTCP, a restricted congestion control.  Adding it to
	# the host-global tcp_allowed_congestion_control would relax the
	# restricted-CC policy for the whole host (there is no per-netns
	# allowed set).  Instead pin dctcp on the test routes with
	# "congctl": the route's RTAX_CC_ALGO is honoured on both the
	# connect and accept paths without the restricted-CC check, and a
	# dctcp route also carries DST_FEATURE_ECN_CA so the server
	# negotiates ECN -- all confined to the test namespaces.
	local available
	available=$(ip netns exec "$NS1" sysctl -n \
		net.ipv4.tcp_available_congestion_control)
	if ! echo "$available" | grep -qw dctcp; then
		log_test_skip "Local ECMP PLB rehash: DCTCP not available"
		return "$ksft_skip"
	fi
	install_ecmp_routes change dctcp
	defer install_ecmp_routes change

	# Save NS1 sysctls before modifying them.
	local saved_ecn1 saved_plb_enabled saved_plb_rounds
	local saved_plb_thresh saved_plb_suspend
	saved_ecn1=$(ip netns exec "$NS1" sysctl -n net.ipv4.tcp_ecn)
	saved_plb_enabled=$(ip netns exec "$NS1" sysctl -n net.ipv4.tcp_plb_enabled)
	saved_plb_rounds=$(ip netns exec "$NS1" sysctl -n net.ipv4.tcp_plb_rehash_rounds)
	saved_plb_thresh=$(ip netns exec "$NS1" sysctl -n net.ipv4.tcp_plb_cong_thresh)
	saved_plb_suspend=$(ip netns exec "$NS1" sysctl -n net.ipv4.tcp_plb_suspend_rto_sec)

	# Enable ECN and PLB on the sender; dctcp comes from the route.
	ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_ecn=1
	ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_plb_enabled=1
	ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_plb_rehash_rounds=3
	ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_plb_cong_thresh=1
	ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_plb_suspend_rto_sec=0
	defer ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_ecn="$saved_ecn1"
	defer ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_plb_enabled="$saved_plb_enabled"
	defer ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_plb_rehash_rounds="$saved_plb_rounds"
	defer ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_plb_cong_thresh="$saved_plb_thresh"
	defer ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_plb_suspend_rto_sec="$saved_plb_suspend"

	ip netns exec "$NS2" socat -u \
		"TCP6-LISTEN:$port,bind=[fd00:ff::2],reuseaddr" - >/dev/null &
	defer kill_process $!

	wait_local_port_listen "$NS2" "$port" tcp

	local base_tx0 base_tx1
	base_tx0=$(link_tx_packets_get "$NS1" veth0a)
	base_tx1=$(link_tx_packets_get "$NS1" veth1a)

	ip netns exec "$NS1" timeout 90 socat -u \
		OPEN:/dev/zero \
		"TCP6:[fd00:ff::2]:$port,bind=[fd00:ff::1]" &>/dev/null &
	local client_pid=$!
	defer kill_process "$client_pid"

	# Wait for data to start flowing before applying ECN marking.
	busywait "$BUSYWAIT_TIMEOUT" until_counter_is \
		">= $((base_tx0 + base_tx1 + 10))" \
		link_tx_packets_total "$NS1" > /dev/null
	check_err $? "no TX activity detected"
	if [ "$RET" -ne 0 ]; then
		log_test "Local ECMP PLB rehash: ECN-marked path"
		return
	fi

	# Snapshot TX counters and rehash stats before ECN marking.
	local pre_ecn_tx0 pre_ecn_tx1
	pre_ecn_tx0=$(link_tx_packets_get "$NS1" veth0a)
	pre_ecn_tx1=$(link_tx_packets_get "$NS1" veth1a)

	local plb_before rto_before
	plb_before=$(get_netstat_counter "$NS1" TCPPLBRehash)
	rto_before=$(get_netstat_counter "$NS1" TcpTimeoutRehash)

	# CE-mark all data on both paths.  PLB detects sustained
	# congestion and rehashes, bouncing traffic between paths.
	mark_ecn "$NS1" veth0a
	defer unblock_tcp "$NS1" veth0a	# removes the marking rule
	mark_ecn "$NS1" veth1a
	defer unblock_tcp "$NS1" veth1a	# removes the marking rule

	# Wait for meaningful data on both paths, proving PLB rehashed
	# the connection and traffic actually moved.  Require at least
	# 100 packets beyond the baseline to rule out stray control
	# packets (ND, etc.) satisfying the check.
	slowwait 60 dev_tx_packets_above \
		"$NS1" veth0a "$((pre_ecn_tx0 + 100))" > /dev/null
	check_err $? "no data on veth0a after ECN marking"

	slowwait 60 dev_tx_packets_above \
		"$NS1" veth1a "$((pre_ecn_tx1 + 100))" > /dev/null
	check_err $? "no data on veth1a after ECN marking"

	# The rehash must be attributed to PLB, not to an RTO.
	local plb_after rto_after
	plb_after=$(get_netstat_counter "$NS1" TCPPLBRehash)
	rto_after=$(get_netstat_counter "$NS1" TcpTimeoutRehash)
	if [ "$plb_after" -le "$plb_before" ]; then
		check_err 1 "TCPPLBRehash counter did not increment"
	fi
	if [ "$rto_after" -gt "$rto_before" ]; then
		check_err 1 "TcpTimeoutRehash incremented; rehash was RTO-driven, not PLB"
	fi

	log_test "Local ECMP PLB rehash: ECN-marked path"
}

# Verify that hash policy 1 (L3+L4 symmetric) preserves the ECMP path
# across rehash.  Policy 1 computes a deterministic hash from the
# 5-tuple, so mp_hash stays 0 and rt6_multipath_hash() always selects
# the same path regardless of txhash changes.
test_ecmp_hash_policy1_no_rehash()
{
	RET=0
	local port
	alloc_ports port

	local saved_policy
	saved_policy=$(ip netns exec "$NS1" sysctl -n \
		net.ipv6.fib_multipath_hash_policy)
	ip netns exec "$NS1" sysctl -qw net.ipv6.fib_multipath_hash_policy=1
	defer ip netns exec "$NS1" sysctl -qw \
		net.ipv6.fib_multipath_hash_policy="$saved_policy"

	# Both egress paths stay blocked for the whole test; the tc drop
	# counters are what tells us where the SYNs were sent.
	block_tcp "$NS1" veth0a
	defer unblock_tcp "$NS1" veth0a
	block_tcp "$NS1" veth1a
	defer unblock_tcp "$NS1" veth1a

	ip netns exec "$NS2" socat \
		"TCP6-LISTEN:$port,bind=[fd00:ff::2],reuseaddr,fork" \
		EXEC:"echo POLICY1_OK" &
	defer kill_process $!

	wait_local_port_listen "$NS2" "$port" tcp

	local rehash_before
	rehash_before=$(get_netstat_counter "$NS1" TcpTimeoutRehash)

	ip netns exec "$NS1" timeout 10 socat -u \
		"TCP6:[fd00:ff::2]:$port,bind=[fd00:ff::1],connect-timeout=8" \
		STDOUT >/dev/null 2>&1 &
	local client_pid=$!
	defer kill_process "$client_pid"

	# With policy 1, the deterministic 5-tuple hash always selects
	# the same path.  Wait for multiple SYN retransmits (proving
	# rehash was attempted), then verify all SYNs landed on the
	# same interface.
	local rehash_after
	slowwait 8 until_counter_is ">= $((rehash_before + 3))" \
		get_netstat_counter "$NS1" TcpTimeoutRehash > /dev/null
	rehash_after=$(get_netstat_counter "$NS1" TcpTimeoutRehash)
	if [ "$rehash_after" -le "$rehash_before" ]; then
		check_err 1 "TcpTimeoutRehash counter did not increment"
	fi

	local c0 c1
	c0=$(tc_filter_pkt_count "$NS1" veth0a)
	c1=$(tc_filter_pkt_count "$NS1" veth1a)
	if [ "${c0:-0}" -ge 1 ] && [ "${c1:-0}" -ge 1 ]; then
		check_err 1 "SYNs appeared on both paths despite policy 1"
	fi
	if [ "${c0:-0}" -eq 0 ] && [ "${c1:-0}" -eq 0 ]; then
		check_err 1 "no SYNs observed on either path"
	fi

	log_test "Local ECMP policy 1: no path change on rehash"
}

# Verify that mp_hash does not leak into the on-wire flowlabel.
# With auto_flowlabels=0, the wire flowlabel must be 0.  Install tc
# filters that pass TCP with flowlabel=0 but drop TCP with nonzero
# flowlabel, then establish a connection and transfer data.  If
# mp_hash leaked into fl6->flowlabel, the SYN or data packets would
# be dropped and the connection would fail.
test_ecmp_no_flowlabel_leak()
{
	RET=0
	local port
	alloc_ports port

	local saved_afl
	saved_afl=$(ip netns exec "$NS1" sysctl -n \
		net.ipv6.auto_flowlabels)
	ip netns exec "$NS1" sysctl -qw net.ipv6.auto_flowlabels=0
	defer ip netns exec "$NS1" sysctl -qw \
		net.ipv6.auto_flowlabels="$saved_afl"

	# On both egress interfaces: pass TCP with flowlabel=0 (prio 1),
	# drop any remaining TCP (nonzero flowlabel, prio 2).  ICMPv6
	# matches neither filter and passes through normally.
	local dev
	for dev in veth0a veth1a; do
		ip netns exec "$NS1" tc qdisc add dev "$dev" \
			root handle 1: prio
		ip netns exec "$NS1" tc filter add dev "$dev" parent 1: \
			protocol ipv6 prio 1 u32 \
			match u32 0x00000000 0x000FFFFF at 0 \
			match u8 0x06 0xff at 6 \
			action ok
		ip netns exec "$NS1" tc filter add dev "$dev" parent 1: \
			protocol ipv6 prio 2 u32 \
			match u8 0x06 0xff at 6 \
			action drop
		defer unblock_tcp "$NS1" "$dev"
	done

	ip netns exec "$NS2" socat \
		"TCP6-LISTEN:$port,bind=[fd00:ff::2],reuseaddr" \
		EXEC:"echo FLOWLABEL_OK" &
	defer kill_process $!

	wait_local_port_listen "$NS2" "$port" tcp

	local tmpfile
	tmpfile=$(mktemp)
	defer rm -f "$tmpfile"

	# Runs in the foreground; connect-timeout bounds the wait.
	ip netns exec "$NS1" socat -u \
		"TCP6:[fd00:ff::2]:$port,bind=[fd00:ff::1],connect-timeout=10" \
		STDOUT >"$tmpfile" 2>&1

	local result
	result=$(cat "$tmpfile" 2>/dev/null)
	if [[ "$result" != *"FLOWLABEL_OK"* ]]; then
		check_err 1 "connection failed: mp_hash may have leaked into wire flowlabel"
	fi

	log_test "No flowlabel leak with auto_flowlabels=0"
}

# Teardown shared by every exit path of ecmp_dst_rebuild_check: restore
# tcp_retries1 in namespace $1 to value $2, then stop and reap the
# background client with pid $3.
ecmp_dst_rebuild_cleanup()
{
	local ns=$1; shift
	local retries1=$1; shift
	local pid=$1; shift

	ip netns exec "$ns" sysctl -qw \
		net.ipv4.tcp_retries1="$retries1"
	kill "$pid" 2>/dev/null
	wait "$pid" 2>/dev/null
}

# Helper: stream data, invalidate the cached dst by adding and
# removing a dummy route (bumps fib6_node sernum), then check that
# traffic stays on the same ECMP path.  Used by both the normal
# tcp_v6_connect and syncookie variants.
#
# Arguments: $1 - client namespace, $2 - server port.
# Returns: 0 when the path is unchanged, 1 when most post-rebuild
# traffic moved to the other device, 2 when not enough traffic was
# observed to decide.
ecmp_dst_rebuild_check()
{
	local ns_client=$1; shift
	local port=$1; shift
	local rc=0

	# Suppress __dst_negative_advice() during the test so that a
	# real TCP timeout cannot trigger an additional dst
	# invalidation via a different code path.
	local saved_retries1
	saved_retries1=$(ip netns exec "$ns_client" sysctl -n \
		net.ipv4.tcp_retries1)
	ip netns exec "$ns_client" sysctl -qw net.ipv4.tcp_retries1=255

	local base0 base1
	base0=$(link_tx_packets_get "$ns_client" veth0a)
	base1=$(link_tx_packets_get "$ns_client" veth1a)

	ip netns exec "$ns_client" timeout 15 socat -u \
		OPEN:/dev/zero \
		"TCP6:[fd00:ff::2]:$port,bind=[fd00:ff::1]" \
		&>/dev/null &
	local client_pid=$!

	# Wait for enough packets to identify the active path.
	# Return 2 for setup failure (distinct from 1 = path changed).
	if ! busywait "$BUSYWAIT_TIMEOUT" until_counter_is \
		">= $((base0 + base1 + 50))" \
		link_tx_packets_total "$ns_client" > /dev/null; then
		ecmp_dst_rebuild_cleanup "$ns_client" "$saved_retries1" \
			"$client_pid"
		return 2
	fi

	local mid0 mid1 active_dev inactive_dev
	mid0=$(link_tx_packets_get "$ns_client" veth0a)
	mid1=$(link_tx_packets_get "$ns_client" veth1a)
	if [ $((mid0 - base0)) -ge $((mid1 - base1)) ]; then
		active_dev=veth0a; inactive_dev=veth1a
	else
		active_dev=veth1a; inactive_dev=veth0a
	fi

	local active_before inactive_before
	active_before=$(link_tx_packets_get "$ns_client" "$active_dev")
	inactive_before=$(link_tx_packets_get "$ns_client" "$inactive_dev")

	# Invalidate the cached dst by bumping the fib6_node sernum.
	# Adding and removing a high-metric dummy route achieves this
	# without touching the ECMP nexthops, avoiding a transient
	# single-nexthop state during multipath route replace.
	ip -n "$ns_client" -6 route add fd00:ff::2/128 dev lo metric 9999
	ip -n "$ns_client" -6 route del fd00:ff::2/128 dev lo metric 9999

	# Wait for enough post-rebuild traffic to detect a path change.
	if ! busywait "$BUSYWAIT_TIMEOUT" until_counter_is \
		">= $((active_before + inactive_before + 50))" \
		link_tx_packets_total "$ns_client" > /dev/null; then
		ecmp_dst_rebuild_cleanup "$ns_client" "$saved_retries1" \
			"$client_pid"
		return 2
	fi

	local active_after inactive_after
	active_after=$(link_tx_packets_get "$ns_client" "$active_dev")
	inactive_after=$(link_tx_packets_get "$ns_client" "$inactive_dev")

	local active_delta=$((active_after - active_before))
	local inactive_delta=$((inactive_after - inactive_before))

	# The path changed if the previously idle device now carries
	# more of the new traffic than the previously active one.
	if [ "$inactive_delta" -gt "$active_delta" ]; then
		rc=1
	fi

	ecmp_dst_rebuild_cleanup "$ns_client" "$saved_retries1" "$client_pid"
	return "$rc"
}

# Run ecmp_dst_rebuild_check for ECMP_REBUILD_ROUNDS rounds, each with
# a fresh server and connection.  With a correct kernel the path is
# deterministic (same txhash always selects the same ECMP nexthop),
# so any path change is a bug.  Multiple rounds catch a buggy kernel
# that picks a random path: each round has 50% chance of accidentally
# matching, so 10 rounds gives < 0.1% false-pass probability.
#
# Arguments: $1 - first port (round r uses $1 + r - 1), $2 - label
# passed to log_test.
ecmp_dst_rebuild_loop()
{
	local base_port=$1; shift
	local label=$1; shift
	local path_changes=0
	local r

	for r in $(seq 1 "$ECMP_REBUILD_ROUNDS"); do
		local port=$((base_port + r - 1))

		ip netns exec "$NS2" socat -u \
			"TCP6-LISTEN:$port,bind=[fd00:ff::2],reuseaddr" \
			- >/dev/null &
		local server_pid=$!

		wait_local_port_listen "$NS2" "$port" tcp

		local check_rc=0
		ecmp_dst_rebuild_check "$NS1" "$port" || check_rc=$?

		kill "$server_pid" 2>/dev/null
		wait "$server_pid" 2>/dev/null

		# Let this round's sockets drain in both namespaces before
		# the next round starts.
		busywait "$BUSYWAIT_TIMEOUT" \
			port_has_no_active_tcp "$NS1" "$port" > /dev/null
		busywait "$BUSYWAIT_TIMEOUT" \
			port_has_no_active_tcp "$NS2" "$port" > /dev/null

		if [ "$check_rc" -eq 2 ]; then
			check_err 1 "no TX activity in round $r"
			break
		elif [ "$check_rc" -eq 1 ]; then
			path_changes=$((path_changes + 1))
		fi
	done

	if [ "$path_changes" -gt 0 ]; then
		check_err 1 "$path_changes/$ECMP_REBUILD_ROUNDS changed path"
	fi

	log_test "$label"
}

# Verify that a dst invalidation does not cause the connection to
# switch ECMP paths.  With the fix, both the initial route lookup
# (tcp_v6_connect) and subsequent rebuilds (inet6_csk_route_socket)
# use sk_txhash >> 1, so the path is stable.
test_ecmp_dst_rebuild_consistency()
{
	RET=0
	local base_port
	alloc_ports base_port "$ECMP_REBUILD_ROUNDS"

	ecmp_dst_rebuild_loop "$base_port" \
		"ECMP path stable after dst invalidation"
}

# Return 0 (true) when no active TCP sockets remain on a port.
# TIME_WAIT is excluded because it does not generate outgoing traffic.
#
# Arguments: $1 - namespace, $2 - port (matched as source or destination).
port_has_no_active_tcp()
{
	local ns=$1; shift
	local port=$1; shift

	# "!" negates the whole pipeline: success means ss printed nothing.
	! ip netns exec "$ns" ss -tnH \
		state established \
		state fin-wait-1 \
		state fin-wait-2 \
		state close-wait \
		state last-ack \
		state closing \
		state syn-sent \
		state syn-recv \
		"sport = :$port or dport = :$port" | grep -q .
}

# Count TCP packets on server egress without blocking them.
# Uses tc filters with "action ok" so packets are counted and passed.
count_tcp()
{
	local ns=$1
	local dev=$2
	local -a tc_in_ns=(ip netns exec "$ns" tc)

	# Root prio qdisc with handle 1: gives the filter a parent.
	"${tc_in_ns[@]}" qdisc add dev "$dev" root handle 1: prio
	# u32 match on byte 6 of the IPv6 header (next header) == 6, TCP.
	"${tc_in_ns[@]}" filter add dev "$dev" parent 1: \
		protocol ipv6 prio 1 u32 match u8 0x06 0xff at 6 action ok
}

# Check that, with syncookies in use, everything the server transmits
# for one connection leaves through a single ECMP path: the SYN-ACK
# (sent from the request socket) and the later ACKs (sent from the full
# socket created in cookie_v6_check). With syncookies the request
# socket is freed after the SYN-ACK and a new one is created during
# cookie validation; if the two request sockets end up with independent
# txhash values they can select different paths, which is what this
# test detects.
#
# A round counts as "split" when the TCP counters on both server
# devices advanced during it.
test_ecmp_syncookie_path_consistency()
{
	local desc="Syncookie server ECMP path consistent"
	local prev_syncookies
	local splits=0
	local r first_port port dev peer_ns
	local listener_pid sender_pid
	local tx_start got_traffic
	local before0 before1 after0 after1 delta0 delta1

	RET=0

	# An empty read means the sysctl could not be queried: skip.
	prev_syncookies=$(ip netns exec "$NS2" sysctl -n \
		net.ipv4.tcp_syncookies 2>/dev/null)
	if [ -z "$prev_syncookies" ]; then
		log_test_skip "$desc"
		return "$ksft_skip"
	fi
	# NOTE(review): 2 is documented in ip-sysctl as "send syncookies
	# unconditionally"; confirm against the kernel docs.
	ip netns exec "$NS2" sysctl -qw net.ipv4.tcp_syncookies=2
	defer ip netns exec "$NS2" sysctl -qw \
		net.ipv4.tcp_syncookies="$prev_syncookies"

	# Counting filters on both server devices; unblock_tcp deletes
	# the root qdisc again when the deferred cleanup runs.
	for dev in veth0b veth1b; do
		count_tcp "$NS2" "$dev"
		defer unblock_tcp "$NS2" "$dev"
	done

	alloc_ports first_port "$ECMP_REBUILD_ROUNDS"

	for r in $(seq 1 "$ECMP_REBUILD_ROUNDS"); do
		port=$((first_port + r - 1))

		ip netns exec "$NS2" socat -u \
			"TCP6-LISTEN:$port,bind=[fd00:ff::2],reuseaddr" \
			- >/dev/null &
		listener_pid=$!

		wait_local_port_listen "$NS2" "$port" tcp

		# Server-side TCP counters before the client connects.
		before0=$(tc_filter_pkt_count "$NS2" veth0b)
		before1=$(tc_filter_pkt_count "$NS2" veth1b)

		ip netns exec "$NS1" timeout 5 socat -u \
			OPEN:/dev/zero \
			"TCP6:[fd00:ff::2]:$port,bind=[fd00:ff::1]" \
			&>/dev/null &
		sender_pid=$!

		# Require 200 more client TX packets before sampling the
		# server counters.
		tx_start=$(link_tx_packets_total "$NS1")
		got_traffic=yes
		busywait "$BUSYWAIT_TIMEOUT" until_counter_is \
			">= $((tx_start + 200))" \
			link_tx_packets_total "$NS1" > /dev/null ||
			got_traffic=no

		if [ "$got_traffic" = yes ]; then
			after0=$(tc_filter_pkt_count "$NS2" veth0b)
			after1=$(tc_filter_pkt_count "$NS2" veth1b)
			# An empty counter read is treated as 0.
			delta0=$(( ${after0:-0} - ${before0:-0} ))
			delta1=$(( ${after1:-0} - ${before1:-0} ))

			if ((delta0 > 0 && delta1 > 0)); then
				splits=$((splits + 1))
			fi
		else
			check_err 1 "no TX activity in round $r"
		fi

		kill "$sender_pid" 2>/dev/null
		wait "$sender_pid" 2>/dev/null
		kill "$listener_pid" 2>/dev/null
		wait "$listener_pid" 2>/dev/null

		# A round without traffic ends the test early.
		[ "$got_traffic" = yes ] || break

		# Wait for TCP teardown packets (FIN/RST) to finish so
		# they do not pollute the next round's tc filter counters.
		for peer_ns in "$NS1" "$NS2"; do
			busywait "$BUSYWAIT_TIMEOUT" \
				port_has_no_active_tcp "$peer_ns" "$port" \
				> /dev/null
		done
	done

	if [ "$splits" -ne 0 ]; then
		check_err 1 "$splits/$ECMP_REBUILD_ROUNDS had split server path"
	fi

	log_test "$desc"
}

# Script driver: socat is required by every test in this file.
require_command socat

# Run deferred cleanups and remove the namespaces on any exit path.
trap 'defer_scopes_cleanup; cleanup_all_ns' EXIT
setup || exit $?
tests_run
exit "$EXIT_STATUS"