xref: /linux/tools/testing/selftests/net/ecmp_rehash.sh (revision bc5c25c8f684982d0363380e3490f626c68e0427)
1#!/bin/bash
2# SPDX-License-Identifier: GPL-2.0
3#
4# Test local ECMP path re-selection on TCP retransmission timeout and PLB.
5#
6# Two namespaces connected by two parallel veth pairs with a 2-way ECMP
7# route.  When a TCP path is blocked (via tc drop) or congested (via
8# netem ECN marking), the kernel rehashes the connection via
9# sk_rethink_txhash() + __sk_dst_reset(), causing the next route lookup
10# to select the other ECMP path.
11#
12# Expected runtime: ~60 seconds.  Most time is spent waiting for TCP
13# retransmission timeouts (1-7s per test) and running multi-round
14# consistency checks (10 rounds each).  The large slowwait/connect-timeout
15# values (30-120s) are worst-case bounds for CI; a correctly functioning
16# kernel reaches each check well before the timeout expires.
17
18source lib.sh
19
# Letters naming the two point-to-point subnets (fd00:a::/64 and
# fd00:b::/64).  Index i pairs with the veth${i}a/veth${i}b link that
# setup() creates.
SUBNETS=(a b)
# First TCP port handed out; NEXT_PORT starts from this value.
PORT=9900
# Number of rounds run by ecmp_dst_rebuild_loop().  Defaults to 10 and is
# only assigned when unset or empty, so the environment can override it.
: "${ECMP_REBUILD_ROUNDS:=10}"
23
# alloc_ports NAME [COUNT]: hand out ports from one shared, ever-advancing
# counter.  The variable called NAME receives the first port of the
# reservation; the counter then moves past COUNT ports (1 when COUNT is
# omitted).  Because nothing is ever handed out twice, a retry or a newly
# added test cannot collide with an earlier one; the per-round tests
# reserve ECMP_REBUILD_ROUNDS ports in one call.
#
# No locals are declared here on purpose: NAME is written with printf -v
# and must resolve to the caller's variable, whatever it is called.
NEXT_PORT=$PORT
alloc_ports()
{
	# Normalise the optional COUNT so $2 is always populated.
	set -- "$1" "${2:-1}"

	printf -v "$1" '%d' "$NEXT_PORT"
	NEXT_PORT=$((NEXT_PORT + $2))
}
34
35ALL_TESTS="
36	test_ecmp_syn_rehash
37	test_ecmp_synack_rehash
38	test_ecmp_midstream_rehash
39	test_ecmp_midstream_ack_rehash
40	test_ecmp_plb_rehash
41	test_ecmp_hash_policy1_no_rehash
42	test_ecmp_no_flowlabel_leak
43	test_ecmp_dst_rebuild_consistency
44	test_ecmp_syncookie_path_consistency
45"
46
# link_tx_packets_get NS DEV: print the tx_packets statistic that sysfs
# exposes for interface DEV, read from inside namespace NS.
link_tx_packets_get()
{
	local netns=$1
	local ifname=$2
	local stat="/sys/class/net/$ifname/statistics/tx_packets"

	ip netns exec "$netns" cat "$stat"
}
54
# tc_filter_pkt_count NS DEV: print the packet count of the first tc filter
# action listed under parent 1: on DEV in namespace NS.
#
# tx_packets cannot be used for this: a packet dropped by "action drop"
# never reaches veth_xmit, so the device counter does not move, whereas
# the tc action keeps its own "Sent <bytes> bytes <n> pkt" statistics.
# Prints nothing when no such line is found; tc errors are discarded.
tc_filter_pkt_count()
{
	local netns=$1
	local ifname=$2

	ip netns exec "$netns" tc -s filter show dev "$ifname" parent 1: \
		2>/dev/null |
		awk '
			/Sent .* pkt/ {
				# The count is the field just before "pkt".
				i = 1
				while (i <= NF) {
					if ($i == "pkt") {
						print $(i - 1)
						exit
					}
					i++
				}
			}'
}
70
# get_netstat_counter NS FIELD: print the value of TcpExt counter FIELD as
# reported by /proc/net/netstat inside namespace NS.  Prints 0 when the
# counter is not present.
#
# The file holds a "TcpExt:" line of column names followed by a "TcpExt:"
# line of values; the awk program remembers the names from the first and
# prints the matching column of any later one.
get_netstat_counter()
{
	local netns=$1
	local counter=$2
	local found

	# shellcheck disable=SC2016
	found=$(ip netns exec "$netns" awk -v key="$counter" '
		/^TcpExt:/ {
			if (have_names) {
				for (i = 1; i <= ncols; i++)
					if (names[i] == key) print $i
			} else {
				ncols = NF
				for (i = 1; i <= ncols; i++)
					names[i] = $i
				have_names = 1
			}
		}
	' /proc/net/netstat)
	echo "${found:-0}"
}
92
# mark_ecn NS DEV: install a root netem qdisc on DEV in namespace NS with
# "loss 100% ecn", so that every ECT packet is CE-marked instead of being
# dropped.
mark_ecn()
{
	local netns=$1 ifname=$2
	local -a netem=(netem loss 100% ecn)

	ip netns exec "$netns" tc qdisc add dev "$ifname" root "${netem[@]}"
}
101
# block_tcp NS DEV: drop all TCP leaving DEV in namespace NS.
#
# A prio qdisc (handle 1:) is installed as root, then a u32 filter that
# matches byte 6 of the IPv6 header (next-header) against 6 (TCP) with
# "action drop".  Anything else, ICMPv6 included, is not matched and
# passes.
block_tcp()
{
	local netns=$1 ifname=$2
	local -a in_ns=(ip netns exec "$netns" tc)

	"${in_ns[@]}" qdisc add dev "$ifname" root handle 1: prio
	"${in_ns[@]}" filter add dev "$ifname" parent 1: \
		protocol ipv6 prio 1 u32 match u8 0x06 0xff at 6 action drop
}
112
# unblock_tcp NS DEV: delete the root qdisc of DEV in namespace NS, which
# also removes whatever filters were attached to it.  Error output is
# discarded so the call is quiet when no qdisc is installed; the exit
# status of the delete is still returned.
unblock_tcp()
{
	local netns=$1 ifname=$2
	local -a del_root=(tc qdisc del dev "$ifname" root)

	ip netns exec "$netns" "${del_root[@]}" 2>/dev/null
}
120
# dev_tx_packets_above NS DEV BASELINE: succeed once the tx_packets
# counter of DEV in namespace NS is strictly greater than BASELINE.
# Intended as a predicate for polling helpers.
dev_tx_packets_above()
{
	local netns=$1 ifname=$2 floor=$3
	local now

	now=$(link_tx_packets_get "$netns" "$ifname")
	[ "$floor" -lt "$now" ]
}
132
# both_devs_attempted NS DEV0 DEV1: succeed once the tc drop filter on
# each of the two devices has counted at least one packet, i.e. a TCP
# packet was attempted (and dropped) on both paths.  A missing count is
# treated as 0.
both_devs_attempted()
{
	local netns=$1
	local first=$2 second=$3
	local hits_first hits_second

	hits_first=$(tc_filter_pkt_count "$netns" "$first")
	hits_second=$(tc_filter_pkt_count "$netns" "$second")

	[ "${hits_first:-0}" -ge 1 ] || return
	[ "${hits_second:-0}" -ge 1 ]
}
145
# link_tx_packets_total NS [DEV0 [DEV1]]: print the sum of the tx_packets
# counters of two devices in namespace NS.  The devices default to the
# NS1-side pair, veth0a and veth1a.
link_tx_packets_total()
{
	local netns=$1
	local first=${2:-veth0a}
	local second=${3:-veth1a}

	echo $(( $(link_tx_packets_get "$netns" "$first") +
		 $(link_tx_packets_get "$netns" "$second") ))
}
155
# install_ecmp_routes OP [CC]: (re)install the 2-way multipath routes
# between the loopback addresses of NS1 and NS2.
#
#   OP  ip-route verb: "add" to create the routes, "change" to replace.
#   CC  optional congestion control to pin on both routes via "congctl".
#       Because dctcp carries TCP_CONG_NEEDS_ECN, pinning it also tags
#       the route with DST_FEATURE_ECN_CA, which makes the server
#       negotiate ECN without the listener itself having to run dctcp.
#
# This is the only place that spells out the nexthop topology, so a test
# can re-pin the routes and later restore them with a single call each.
# The exit status is that of the NS2 route command.
install_ecmp_routes()
{
	local verb=$1
	local algo=$2
	local -a pin=()

	if [ -n "$algo" ]; then
		pin=(congctl "$algo")
	fi

	# NS1 -> NS2 loopback, one nexthop per veth pair.
	ip -n "$NS1" -6 route "$verb" fd00:ff::2/128 "${pin[@]}" \
		nexthop via fd00:a::2 dev veth0a \
		nexthop via fd00:b::2 dev veth1a

	# NS2 -> NS1 loopback, mirrored.
	ip -n "$NS2" -6 route "$verb" fd00:ff::1/128 "${pin[@]}" \
		nexthop via fd00:a::1 dev veth0b \
		nexthop via fd00:b::1 dev veth1b
}
178
# Build the topology shared by every test:
#   - namespaces NS1 and NS2, joined by veth0a<->veth0b (fd00:a::/64)
#     and veth1a<->veth1b (fd00:b::/64);
#   - loopback addresses fd00:ff::1 (NS1) and fd00:ff::2 (NS2), reached
#     through the multipath routes from install_ecmp_routes;
#   - DAD disabled, forwarding and txrehash enabled in both namespaces.
# Returns $ksft_skip when tcp_syn_linear_timeouts cannot be set or when
# the loopback-to-loopback ping fails.
setup()
{
	setup_ns NS1 NS2

	local netns knob
	for netns in "$NS1" "$NS2"; do
		for knob in \
			net.ipv6.conf.all.accept_dad=0 \
			net.ipv6.conf.default.accept_dad=0 \
			net.ipv6.conf.all.forwarding=1 \
			net.core.txrehash=1
		do
			ip netns exec "$netns" sysctl -qw "$knob"
		done
	done

	local idx net near far
	for idx in 0 1; do
		net=${SUBNETS[$idx]}
		near="veth${idx}a"
		far="veth${idx}b"

		ip link add "$near" type veth peer name "$far"
		ip link set "$near" netns "$NS1"
		ip link set "$far" netns "$NS2"
		ip -n "$NS1" addr add "fd00:${net}::1/64" dev "$near"
		ip -n "$NS2" addr add "fd00:${net}::2/64" dev "$far"
		ip -n "$NS1" link set "$near" up
		ip -n "$NS2" link set "$far" up
	done

	ip -n "$NS1" addr add fd00:ff::1/128 dev lo
	ip -n "$NS2" addr add fd00:ff::2/128 dev lo

	# Many SYN retries at a fixed 1-second spacing (linear, no
	# exponential backoff) give the rehash test enough attempts to
	# land on both ECMP paths.  Skip when the sysctl cannot be set.
	ip netns exec "$NS1" sysctl -qw \
		net.ipv4.tcp_syn_linear_timeouts=25 || {
		echo "SKIP: tcp_syn_linear_timeouts not supported"
		return "$ksft_skip"
	}
	ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_syn_retries=25

	# The server must keep its request socket alive while its return
	# paths are blocked so that SYN/ACK retransmits keep coming.
	ip netns exec "$NS2" sysctl -qw net.ipv4.tcp_synack_retries=25

	install_ecmp_routes add

	# One ping per subnet in each direction; results are ignored.
	# NOTE(review): presumably warms up neighbour resolution -- confirm.
	for idx in 0 1; do
		net=${SUBNETS[$idx]}
		ip netns exec "$NS1" \
			ping -6 -c1 -W5 "fd00:${net}::2" &>/dev/null
		ip netns exec "$NS2" \
			ping -6 -c1 -W5 "fd00:${net}::1" &>/dev/null
	done

	# The loopback address must be reachable over the ECMP route.
	if ip netns exec "$NS1" ping -6 -c1 -W5 fd00:ff::2 &>/dev/null; then
		return 0
	fi
	echo "Basic connectivity check failed"
	return "$ksft_skip"
}
235
# Block ALL paths, start a connection, wait until SYNs have been dropped
# on both interfaces (proving rehash steered the SYN to a new path), then
# unblock so the connection completes.
#
# Pass criteria: the drop filters on veth0a and veth1a each count at
# least one packet within the slowwait bound, the client output contains
# "ESTABLISH_OK" after unblocking, and NS1's TcpTimeoutRehash increased.
test_ecmp_syn_rehash()
{
	# RET is inspected below after check_err.
	# NOTE(review): presumably the lib.sh per-test result -- confirm.
	RET=0
	local port
	alloc_ports port

	# Drop TCP on both client egress devices; each block is paired with
	# a deferred unblock so an early return still cleans up.
	block_tcp "$NS1" veth0a
	defer unblock_tcp "$NS1" veth0a
	block_tcp "$NS1" veth1a
	defer unblock_tcp "$NS1" veth1a

	# Server: answers every accepted connection with "ESTABLISH_OK".
	ip netns exec "$NS2" socat \
		"TCP6-LISTEN:$port,bind=[fd00:ff::2],reuseaddr,fork" \
		EXEC:"echo ESTABLISH_OK" &
	defer kill_process $!

	wait_local_port_listen "$NS2" "$port" tcp

	# Baseline for the rehash counter, taken before any SYN is sent.
	local rehash_before
	rehash_before=$(get_netstat_counter "$NS1" TcpTimeoutRehash)

	# Start the connection in the background; it will retry SYNs at
	# 1-second intervals until an unblocked path is found.
	# Use -u (unidirectional) to only receive from the server;
	# sending data back would risk SIGPIPE if the server's EXEC
	# child has already exited.
	local tmpfile
	tmpfile=$(mktemp)
	defer rm -f "$tmpfile"

	# Client stdout and stderr both land in $tmpfile.
	ip netns exec "$NS1" socat -u \
		"TCP6:[fd00:ff::2]:$port,bind=[fd00:ff::1],connect-timeout=60" \
		STDOUT >"$tmpfile" 2>&1 &
	local client_pid=$!
	defer kill_process "$client_pid"

	# Wait until both paths have seen at least one dropped SYN.
	# This proves sk_rethink_txhash() rehashed the connection from
	# one ECMP path to the other.
	slowwait 30 both_devs_attempted "$NS1" veth0a veth1a > /dev/null
	check_err $? "SYNs did not appear on both paths (rehash not working)"
	# Bail out early: without a rehash the rest cannot be judged.
	if [ "$RET" -ne 0 ]; then
		log_test "Local ECMP SYN rehash: establish with blocked paths"
		return
	fi

	# Unblock both paths and let the next SYN retransmit succeed.
	unblock_tcp "$NS1" veth0a
	unblock_tcp "$NS1" veth1a

	# Collect the client's exit status; it is only used in the failure
	# message below.
	local rc=0
	wait "$client_pid" || rc=$?

	local result
	result=$(cat "$tmpfile" 2>/dev/null)

	if [[ "$result" != *"ESTABLISH_OK"* ]]; then
		check_err 1 "connection failed after unblocking (rc=$rc): $result"
	fi

	local rehash_after
	rehash_after=$(get_netstat_counter "$NS1" TcpTimeoutRehash)
	if [ "$rehash_after" -le "$rehash_before" ]; then
		check_err 1 "TcpTimeoutRehash counter did not increment"
	fi

	log_test "Local ECMP SYN rehash: establish with blocked paths"
}
307
# Block the server's return paths so SYN/ACKs are dropped.  The client
# retransmits SYNs at 1-second intervals; each duplicate SYN arriving at
# the server triggers tcp_rtx_synack() which re-rolls txhash, so the
# retransmitted SYN/ACK selects a different ECMP return path.
#
# Pass criteria: the drop filters on veth0b and veth1b each count at
# least one packet within the slowwait bound, and the client output
# contains "SYNACK_OK" once the return paths are unblocked.
test_ecmp_synack_rehash()
{
	# RET is inspected below after check_err.
	# NOTE(review): presumably the lib.sh per-test result -- confirm.
	RET=0
	local port
	alloc_ports port

	# Drop TCP on both server egress devices (NS2 side); each block is
	# paired with a deferred unblock.
	block_tcp "$NS2" veth0b
	defer unblock_tcp "$NS2" veth0b
	block_tcp "$NS2" veth1b
	defer unblock_tcp "$NS2" veth1b

	# Server: answers every accepted connection with "SYNACK_OK".
	ip netns exec "$NS2" socat \
		"TCP6-LISTEN:$port,bind=[fd00:ff::2],reuseaddr,fork" \
		EXEC:"echo SYNACK_OK" &
	defer kill_process $!

	wait_local_port_listen "$NS2" "$port" tcp

	# Start the connection; SYNs reach the server (client egress is
	# open) but SYN/ACKs are dropped on the server's return path.
	local tmpfile
	tmpfile=$(mktemp)
	defer rm -f "$tmpfile"

	# Client stdout and stderr both land in $tmpfile.
	ip netns exec "$NS1" socat -u \
		"TCP6:[fd00:ff::2]:$port,bind=[fd00:ff::1],connect-timeout=60" \
		STDOUT >"$tmpfile" 2>&1 &
	local client_pid=$!
	defer kill_process "$client_pid"

	# Wait until both server-side interfaces have dropped at least
	# one SYN/ACK, proving the server rehashed its return path.
	slowwait 30 both_devs_attempted "$NS2" veth0b veth1b > /dev/null
	check_err $? "SYN/ACKs did not appear on both return paths"
	# Bail out early: without a rehash the rest cannot be judged.
	if [ "$RET" -ne 0 ]; then
		log_test "Local ECMP SYN/ACK rehash: blocked return path"
		return
	fi

	# Unblock and let the connection complete.
	unblock_tcp "$NS2" veth0b
	unblock_tcp "$NS2" veth1b

	# Collect the client's exit status; it is only used in the failure
	# message below.
	local rc=0
	wait "$client_pid" || rc=$?

	local result
	result=$(cat "$tmpfile" 2>/dev/null)

	if [[ "$result" != *"SYNACK_OK"* ]]; then
		check_err 1 "connection failed after unblocking (rc=$rc): $result"
	fi

	log_test "Local ECMP SYN/ACK rehash: blocked return path"
}
367
# Establish a data transfer with both paths open, then block the
# active path.  Verify that data appears on the previously inactive
# path (proving RTO triggered a rehash) and that TcpTimeoutRehash
# incremented.
#
# With 2-way ECMP each rehash may pick the same path, so a single
# attempt can occasionally fail.  Retry once for robustness.

# Single attempt at the midstream rehash check.  Returns 0 on success.
#
# Arguments: $1 - TCP port for this attempt's server and client.
# Outputs:   on failure, a one-line reason on stdout.
# Returns:   0 on success, 1 on failure.
#
# Cleanup (unblock, sysctl restore, killing both socat processes) is done
# inline before returning rather than through defer.
ecmp_midstream_rehash_attempt()
{
	local port=$1; shift
	local reason=""

	# Sink server in NS2: accepts one connection, discards the data.
	ip netns exec "$NS2" socat -u \
		"TCP6-LISTEN:$port,bind=[fd00:ff::2],reuseaddr" - >/dev/null &
	local server_pid=$!

	wait_local_port_listen "$NS2" "$port" tcp

	# Per-device TX baselines, taken before the client starts.
	local base_tx0 base_tx1
	base_tx0=$(link_tx_packets_get "$NS1" veth0a)
	base_tx1=$(link_tx_packets_get "$NS1" veth1a)

	# Continuous data source; timeout caps overall test duration and
	# must exceed the slowwait below so data keeps flowing.
	ip netns exec "$NS1" timeout 90 socat -u \
		OPEN:/dev/zero \
		"TCP6:[fd00:ff::2]:$port,bind=[fd00:ff::1]" &>/dev/null &
	local client_pid=$!

	# Wait for enough packets to identify the active path.
	if ! busywait "$BUSYWAIT_TIMEOUT" until_counter_is \
			">= $((base_tx0 + base_tx1 + 10))" \
		link_tx_packets_total "$NS1" > /dev/null; then
		kill "$client_pid" "$server_pid" 2>/dev/null
		wait "$client_pid" "$server_pid" 2>/dev/null
		echo "no TX activity"
		return 1
	fi

	# Find the active path and block it.
	# The device with the larger TX delta is taken as active; a tie
	# goes to index 0.
	local current_tx0 current_tx1 active_idx inactive_idx
	current_tx0=$(link_tx_packets_get "$NS1" veth0a)
	current_tx1=$(link_tx_packets_get "$NS1" veth1a)
	if [ $((current_tx0 - base_tx0)) -ge $((current_tx1 - base_tx1)) ]; then
		active_idx=0; inactive_idx=1
	else
		active_idx=1; inactive_idx=0
	fi

	local rehash_before
	rehash_before=$(get_netstat_counter "$NS1" TcpTimeoutRehash)
	# Suppress __dst_negative_advice() in tcp_write_timeout() so
	# that __sk_dst_reset() is the only dst-invalidation mechanism
	# on the RTO path.
	local saved_retries1
	saved_retries1=$(ip netns exec "$NS1" sysctl -n net.ipv4.tcp_retries1)
	ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_retries1=255

	block_tcp "$NS1" "veth${active_idx}a"

	# Capture baseline after block_tcp returns.  block_tcp adds a
	# prio qdisc then a tc filter; between those two steps the
	# qdisc's CAN_BYPASS fast-path lets packets through unfiltered.
	local inactive_before
	inactive_before=$(link_tx_packets_get "$NS1" "veth${inactive_idx}a")

	# Wait for meaningful data on the previously inactive path,
	# proving RTO triggered a rehash and data actually moved.
	if ! slowwait 60 dev_tx_packets_above \
		"$NS1" "veth${inactive_idx}a" "$((inactive_before + 100))" \
		> /dev/null; then
		reason="no data on alternate path"
	fi

	# Both checks are evaluated; reasons are joined with "; ".
	local rehash_after
	rehash_after=$(get_netstat_counter "$NS1" TcpTimeoutRehash)
	if [ "$rehash_after" -le "$rehash_before" ]; then
		reason="${reason:+$reason; }TcpTimeoutRehash did not increment"
	fi

	# Undo every change made above before reporting the outcome.
	unblock_tcp "$NS1" "veth${active_idx}a"
	ip netns exec "$NS1" sysctl -qw \
		net.ipv4.tcp_retries1="$saved_retries1"
	kill "$client_pid" "$server_pid" 2>/dev/null
	wait "$client_pid" "$server_pid" 2>/dev/null
	if [ -n "$reason" ]; then
		echo "$reason"
		return 1
	fi
	return 0
}
461
# Run the midstream rehash check, retrying once on a fresh port.
#
# The first attempt's failure reason is discarded.  Only when it fails is
# a second attempt made, and that attempt's status and printed reason are
# what get reported through check_err.
test_ecmp_midstream_rehash()
{
	RET=0
	local first_port second_port
	alloc_ports first_port
	alloc_ports second_port

	local why
	ecmp_midstream_rehash_attempt "$first_port" >/dev/null || {
		why=$(ecmp_midstream_rehash_attempt "$second_port")
		check_err $? "$why"
	}

	log_test "Local ECMP midstream rehash: block active path"
}
477
# Single attempt at the ACK rehash check.  Returns 0 on success.
#
# Arguments: $1 - TCP port for this attempt's server and client.
# Outputs:   on failure, a one-line reason on stdout.
# Returns:   0 on success, 1 on failure.
#
# Cleanup (unblocking both NS2 devices, killing both socat processes) is
# done inline before returning rather than through defer.
ecmp_ack_rehash_attempt()
{
	local port=$1; shift
	local reason=""

	# Sink server in NS2: accepts one connection, discards the data.
	ip netns exec "$NS2" socat -u \
		"TCP6-LISTEN:$port,bind=[fd00:ff::2],reuseaddr" - >/dev/null &
	local server_pid=$!

	wait_local_port_listen "$NS2" "$port" tcp

	# Server-side (NS2) TX baselines: this test watches the ACK path.
	local base_tx0 base_tx1
	base_tx0=$(link_tx_packets_get "$NS2" veth0b)
	base_tx1=$(link_tx_packets_get "$NS2" veth1b)

	# Continuous data source from NS1 to NS2.  Cap the send buffer
	# so in-flight data stays below the receiver's advertised window.
	# Without this, the sender can exhaust the receiver's window and
	# enter persist mode (zero-window probing) instead of RTO when
	# ACKs are blocked, and persist probes do not trigger flowlabel
	# rehash.
	ip netns exec "$NS1" timeout 120 socat -u \
		OPEN:/dev/zero \
		"TCP6:[fd00:ff::2]:$port,bind=[fd00:ff::1],sndbuf=16384" \
		&>/dev/null &
	local client_pid=$!

	# Wait for enough server TX (ACKs) to identify the active return path.
	if ! busywait "$BUSYWAIT_TIMEOUT" until_counter_is \
			">= $((base_tx0 + base_tx1 + 10))" \
		link_tx_packets_total "$NS2" veth0b veth1b > /dev/null; then
		kill "$client_pid" "$server_pid" 2>/dev/null
		wait "$client_pid" "$server_pid" 2>/dev/null
		echo "no server TX activity"
		return 1
	fi

	# The device with the larger TX delta is taken as the active return
	# path; a tie goes to veth0b.
	local cur_tx0 cur_tx1 active_dev inactive_dev
	cur_tx0=$(link_tx_packets_get "$NS2" veth0b)
	cur_tx1=$(link_tx_packets_get "$NS2" veth1b)
	if [ $((cur_tx0 - base_tx0)) -ge $((cur_tx1 - base_tx1)) ]; then
		active_dev=veth0b; inactive_dev=veth1b
	else
		active_dev=veth1b; inactive_dev=veth0b
	fi

	local rehash_before
	rehash_before=$(get_netstat_counter "$NS2" TcpDuplicateDataRehash)

	# Block the inactive return path first (no effect on current
	# ACK flow), then block the active path.  This avoids counting
	# normal ACK drops as rehash evidence.
	block_tcp "$NS2" "$inactive_dev"
	# Drop-counter baseline on the inactive device, read before the
	# active path is blocked.
	local inactive_before
	inactive_before=$(tc_filter_pkt_count "$NS2" "$inactive_dev")
	block_tcp "$NS2" "$active_dev"

	# NS1 will RTO (no ACKs), retransmit with new flowlabel.
	# NS2 detects the flowlabel change via tcp_rcv_spurious_retrans(),
	# rehashes, and NS2's ACKs try the previously inactive return
	# path.  One successful rehash is sufficient.
	if ! slowwait 60 until_counter_is \
			">= $((${inactive_before:-0} + 1))" \
		tc_filter_pkt_count "$NS2" "$inactive_dev" > /dev/null; then
		reason="no ACKs on alternate return path after blocking"
	fi

	# Both checks are evaluated; reasons are joined with "; ".
	local rehash_after
	rehash_after=$(get_netstat_counter "$NS2" TcpDuplicateDataRehash)
	if [ "$rehash_after" -le "$rehash_before" ]; then
		reason="${reason:+$reason; }TcpDuplicateDataRehash did not increment"
	fi

	# Undo every change made above before reporting the outcome.
	unblock_tcp "$NS2" "$active_dev"
	unblock_tcp "$NS2" "$inactive_dev"
	kill "$client_pid" "$server_pid" 2>/dev/null
	wait "$client_pid" "$server_pid" 2>/dev/null
	if [ -n "$reason" ]; then
		echo "$reason"
		return 1
	fi
	return 0
}
562
# While data flows from NS1 to NS2, block the ACK return paths of the
# receiver (NS2).  The sender (NS1) times out and retransmits with a new
# flowlabel; the receiver notices the changed flowlabel through
# tcp_rcv_spurious_retrans() and rehashes its own txhash, so its ACKs
# try a different ECMP return path.
#
# A rehash over 2-way ECMP may land on the same path again, so a single
# attempt can occasionally fail.  One retry on a fresh port is made; the
# first attempt's failure reason is discarded and only the retry's status
# and printed reason are reported through check_err.
test_ecmp_midstream_ack_rehash()
{
	RET=0
	local first_port second_port
	alloc_ports first_port
	alloc_ports second_port

	local why
	ecmp_ack_rehash_attempt "$first_port" >/dev/null || {
		why=$(ecmp_ack_rehash_attempt "$second_port")
		check_err $? "$why"
	}

	log_test "Local ECMP midstream ACK rehash: blocked return path"
}
586
# Establish a DCTCP data transfer with PLB enabled, then ECN-mark both
# paths.  Sustained CE marking triggers PLB to call sk_rethink_txhash()
# + __sk_dst_reset(), bouncing the connection between ECMP paths.
# Verify data appears on both paths and that TCPPLBRehash incremented.
#
# The test also fails if TcpTimeoutRehash increased, since that would
# mean the path change was driven by RTO rather than by PLB.
# Returns $ksft_skip (after log_test_skip) when dctcp is not listed in
# tcp_available_congestion_control.
test_ecmp_plb_rehash()
{
	# RET is inspected below after check_err.
	# NOTE(review): presumably the lib.sh per-test result -- confirm.
	RET=0
	local port
	alloc_ports port

	# PLB needs DCTCP, a restricted congestion control.  Adding it to
	# the host-global tcp_allowed_congestion_control would relax the
	# restricted-CC policy for the whole host (there is no per-netns
	# allowed set).  Instead pin dctcp on the test routes with
	# "congctl": the route's RTAX_CC_ALGO is honoured on both the
	# connect and accept paths without the restricted-CC check, and a
	# dctcp route also carries DST_FEATURE_ECN_CA so the server
	# negotiates ECN -- all confined to the test namespaces.
	local available
	available=$(ip netns exec "$NS1" sysctl -n \
		net.ipv4.tcp_available_congestion_control)
	if ! echo "$available" | grep -qw dctcp; then
		log_test_skip "Local ECMP PLB rehash: DCTCP not available"
		return "$ksft_skip"
	fi
	# Re-pin both routes to dctcp; the deferred call (no CC argument)
	# restores the plain routes.
	install_ecmp_routes change dctcp
	defer install_ecmp_routes change

	# Save NS1 sysctls before modifying them.
	local saved_ecn1 saved_plb_enabled saved_plb_rounds
	local saved_plb_thresh saved_plb_suspend
	saved_ecn1=$(ip netns exec "$NS1" sysctl -n net.ipv4.tcp_ecn)
	saved_plb_enabled=$(ip netns exec "$NS1" sysctl -n net.ipv4.tcp_plb_enabled)
	saved_plb_rounds=$(ip netns exec "$NS1" sysctl -n net.ipv4.tcp_plb_rehash_rounds)
	saved_plb_thresh=$(ip netns exec "$NS1" sysctl -n net.ipv4.tcp_plb_cong_thresh)
	saved_plb_suspend=$(ip netns exec "$NS1" sysctl -n net.ipv4.tcp_plb_suspend_rto_sec)

	# Enable ECN and PLB on the sender; dctcp comes from the route.
	ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_ecn=1
	ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_plb_enabled=1
	ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_plb_rehash_rounds=3
	ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_plb_cong_thresh=1
	ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_plb_suspend_rto_sec=0
	# One deferred restore per sysctl changed above.
	defer ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_ecn="$saved_ecn1"
	defer ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_plb_enabled="$saved_plb_enabled"
	defer ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_plb_rehash_rounds="$saved_plb_rounds"
	defer ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_plb_cong_thresh="$saved_plb_thresh"
	defer ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_plb_suspend_rto_sec="$saved_plb_suspend"

	# Sink server in NS2: accepts one connection, discards the data.
	ip netns exec "$NS2" socat -u \
		"TCP6-LISTEN:$port,bind=[fd00:ff::2],reuseaddr" - >/dev/null &
	defer kill_process $!

	wait_local_port_listen "$NS2" "$port" tcp

	# Per-device TX baselines, taken before the client starts.
	local base_tx0 base_tx1
	base_tx0=$(link_tx_packets_get "$NS1" veth0a)
	base_tx1=$(link_tx_packets_get "$NS1" veth1a)

	# Continuous data source, capped at 90s by timeout.
	ip netns exec "$NS1" timeout 90 socat -u \
		OPEN:/dev/zero \
		"TCP6:[fd00:ff::2]:$port,bind=[fd00:ff::1]" &>/dev/null &
	local client_pid=$!
	defer kill_process "$client_pid"

	# Wait for data to start flowing before applying ECN marking.
	busywait "$BUSYWAIT_TIMEOUT" until_counter_is \
			">= $((base_tx0 + base_tx1 + 10))" \
		link_tx_packets_total "$NS1" > /dev/null
	check_err $? "no TX activity detected"
	if [ "$RET" -ne 0 ]; then
		log_test "Local ECMP PLB rehash: ECN-marked path"
		return
	fi

	# Snapshot TX counters and rehash stats before ECN marking.
	local pre_ecn_tx0 pre_ecn_tx1
	pre_ecn_tx0=$(link_tx_packets_get "$NS1" veth0a)
	pre_ecn_tx1=$(link_tx_packets_get "$NS1" veth1a)

	local plb_before rto_before
	plb_before=$(get_netstat_counter "$NS1" TCPPLBRehash)
	rto_before=$(get_netstat_counter "$NS1" TcpTimeoutRehash)

	# CE-mark all data on both paths.  PLB detects sustained
	# congestion and rehashes, bouncing traffic between paths.
	mark_ecn "$NS1" veth0a
	defer unblock_tcp "$NS1" veth0a	# removes the marking rule
	mark_ecn "$NS1" veth1a
	defer unblock_tcp "$NS1" veth1a	# removes the marking rule

	# Wait for meaningful data on both paths, proving PLB rehashed
	# the connection and traffic actually moved.  Require at least
	# 100 packets beyond the baseline to rule out stray control
	# packets (ND, etc.) satisfying the check.
	slowwait 60 dev_tx_packets_above \
		"$NS1" veth0a "$((pre_ecn_tx0 + 100))" > /dev/null
	check_err $? "no data on veth0a after ECN marking"

	slowwait 60 dev_tx_packets_above \
		"$NS1" veth1a "$((pre_ecn_tx1 + 100))" > /dev/null
	check_err $? "no data on veth1a after ECN marking"

	# PLB must have rehashed at least once, and RTO must not have.
	local plb_after rto_after
	plb_after=$(get_netstat_counter "$NS1" TCPPLBRehash)
	rto_after=$(get_netstat_counter "$NS1" TcpTimeoutRehash)
	if [ "$plb_after" -le "$plb_before" ]; then
		check_err 1 "TCPPLBRehash counter did not increment"
	fi
	if [ "$rto_after" -gt "$rto_before" ]; then
		check_err 1 "TcpTimeoutRehash incremented; rehash was RTO-driven, not PLB"
	fi

	log_test "Local ECMP PLB rehash: ECN-marked path"
}
702
# Hash policy 1 (L3+L4 symmetric) must keep a connection on one ECMP path
# even when it rehashes.  Under policy 1 the hash is computed
# deterministically from the 5-tuple, so mp_hash stays 0 and
# rt6_multipath_hash() picks the same path no matter how txhash changes.
#
# Both client egress devices drop TCP for the whole test, so the SYN is
# retransmitted repeatedly.  The test passes when TcpTimeoutRehash
# increased and the dropped SYNs were counted on exactly one device.
test_ecmp_hash_policy1_no_rehash()
{
	RET=0
	local dport
	alloc_ports dport

	# Switch NS1 to policy 1 and restore the old value afterwards.
	local policy_orig
	policy_orig=$(ip netns exec "$NS1" sysctl -n \
		net.ipv6.fib_multipath_hash_policy)
	ip netns exec "$NS1" sysctl -qw net.ipv6.fib_multipath_hash_policy=1
	defer ip netns exec "$NS1" sysctl -qw \
		net.ipv6.fib_multipath_hash_policy="$policy_orig"

	local egress
	for egress in veth0a veth1a; do
		block_tcp "$NS1" "$egress"
		defer unblock_tcp "$NS1" "$egress"
	done

	ip netns exec "$NS2" socat \
		"TCP6-LISTEN:$dport,bind=[fd00:ff::2],reuseaddr,fork" \
		EXEC:"echo POLICY1_OK" &
	defer kill_process $!

	wait_local_port_listen "$NS2" "$dport" tcp

	local rehash_base
	rehash_base=$(get_netstat_counter "$NS1" TcpTimeoutRehash)

	# The connect can never succeed; it only exists to generate SYN
	# retransmits for the duration of the check.
	ip netns exec "$NS1" timeout 10 socat -u \
		"TCP6:[fd00:ff::2]:$dport,bind=[fd00:ff::1],connect-timeout=8" \
		STDOUT >/dev/null 2>&1 &
	local connector=$!
	defer kill_process "$connector"

	# Give the counter time to advance by three (several SYN
	# retransmits, each attempting a rehash).  The wait result is
	# not checked; the requirement below is only that the counter
	# moved at all.
	slowwait 8 until_counter_is ">= $((rehash_base + 3))" \
		get_netstat_counter "$NS1" TcpTimeoutRehash > /dev/null

	local rehash_now
	rehash_now=$(get_netstat_counter "$NS1" TcpTimeoutRehash)
	if [ "$rehash_now" -le "$rehash_base" ]; then
		check_err 1 "TcpTimeoutRehash counter did not increment"
	fi

	# All dropped SYNs must have been counted on a single device.
	local hits0 hits1
	hits0=$(tc_filter_pkt_count "$NS1" veth0a)
	hits1=$(tc_filter_pkt_count "$NS1" veth1a)
	hits0=${hits0:-0}
	hits1=${hits1:-0}
	if [ "$hits0" -ge 1 ] && [ "$hits1" -ge 1 ]; then
		check_err 1 "SYNs appeared on both paths despite policy 1"
	elif [ "$hits0" -eq 0 ] && [ "$hits1" -eq 0 ]; then
		check_err 1 "no SYNs observed on either path"
	fi

	log_test "Local ECMP policy 1: no path change on rehash"
}
765
# mp_hash must not leak into the flowlabel that goes on the wire.
#
# With auto_flowlabels=0 the wire flowlabel has to be 0.  tc filters on
# both client egress devices let TCP with flowlabel 0 through and drop
# TCP carrying any other flowlabel.  A connection is then established and
# data transferred; had mp_hash leaked into fl6->flowlabel, the SYN or
# the data would be dropped and the expected reply would never arrive.
test_ecmp_no_flowlabel_leak()
{
	RET=0
	local dport
	alloc_ports dport

	local afl_orig
	afl_orig=$(ip netns exec "$NS1" sysctl -n \
		net.ipv6.auto_flowlabels)
	ip netns exec "$NS1" sysctl -qw net.ipv6.auto_flowlabels=0
	defer ip netns exec "$NS1" sysctl -qw \
		net.ipv6.auto_flowlabels="$afl_orig"

	# Per egress device:
	#   prio 1 - TCP whose low 20 bits of the first header word (the
	#            flowlabel) are zero: accept;
	#   prio 2 - any TCP left over (nonzero flowlabel): drop.
	# ICMPv6 matches neither filter and passes through normally.
	local -a tc_ns1=(ip netns exec "$NS1" tc)
	local egress
	for egress in veth0a veth1a; do
		"${tc_ns1[@]}" qdisc add dev "$egress" \
			root handle 1: prio
		"${tc_ns1[@]}" filter add dev "$egress" parent 1: \
			protocol ipv6 prio 1 u32 \
			match u32 0x00000000 0x000FFFFF at 0 \
			match u8 0x06 0xff at 6 \
			action ok
		"${tc_ns1[@]}" filter add dev "$egress" parent 1: \
			protocol ipv6 prio 2 u32 \
			match u8 0x06 0xff at 6 \
			action drop
		defer unblock_tcp "$NS1" "$egress"
	done

	ip netns exec "$NS2" socat \
		"TCP6-LISTEN:$dport,bind=[fd00:ff::2],reuseaddr" \
		EXEC:"echo FLOWLABEL_OK" &
	defer kill_process $!

	wait_local_port_listen "$NS2" "$dport" tcp

	local capture
	capture=$(mktemp)
	defer rm -f "$capture"

	# Foreground client; stdout and stderr both go to the capture file.
	ip netns exec "$NS1" socat -u \
		"TCP6:[fd00:ff::2]:$dport,bind=[fd00:ff::1],connect-timeout=10" \
		STDOUT >"$capture" 2>&1

	local reply
	reply=$(cat "$capture" 2>/dev/null)
	case "$reply" in
	*"FLOWLABEL_OK"*)
		;;
	*)
		check_err 1 "connection failed: mp_hash may have leaked into wire flowlabel"
		;;
	esac

	log_test "No flowlabel leak with auto_flowlabels=0"
}
827
# ecmp_dst_rebuild_check NS PORT: stream data from namespace NS to
# [fd00:ff::2]:PORT, invalidate the cached dst, and check that traffic
# stays on the ECMP path it was using.  Shared by the normal
# tcp_v6_connect and the syncookie variants.
#
# Returns: 0 - traffic stayed on the same path
#          1 - the other path carried more packets after the rebuild
#          2 - setup failure (not enough TX activity before or after)
#
# tcp_retries1 is raised to 255 for the duration and restored on every
# exit path, as is the killing of the client.
ecmp_dst_rebuild_check()
{
	local netns=$1
	local dport=$2
	local verdict=0

	# Keep __dst_negative_advice() out of the picture: with a huge
	# tcp_retries1 a real TCP timeout cannot invalidate the dst
	# through that other code path while the test runs.
	local retries1_orig
	retries1_orig=$(ip netns exec "$netns" sysctl -n \
		net.ipv4.tcp_retries1)
	ip netns exec "$netns" sysctl -qw net.ipv4.tcp_retries1=255

	local start0 start1
	start0=$(link_tx_packets_get "$netns" veth0a)
	start1=$(link_tx_packets_get "$netns" veth1a)

	ip netns exec "$netns" timeout 15 socat -u \
		OPEN:/dev/zero \
		"TCP6:[fd00:ff::2]:$dport,bind=[fd00:ff::1]" \
		&>/dev/null &
	local sender=$!

	local warm0 warm1 steady_dev other_dev
	local steady_pre other_pre steady_post other_post

	# Phase 1: enough packets to tell which path is in use.
	if ! busywait "$BUSYWAIT_TIMEOUT" until_counter_is \
			">= $((start0 + start1 + 50))" \
		link_tx_packets_total "$netns" > /dev/null; then
		verdict=2
	else
		warm0=$(link_tx_packets_get "$netns" veth0a)
		warm1=$(link_tx_packets_get "$netns" veth1a)
		# Larger delta wins; a tie goes to veth0a.
		if [ $((warm0 - start0)) -ge $((warm1 - start1)) ]; then
			steady_dev=veth0a; other_dev=veth1a
		else
			steady_dev=veth1a; other_dev=veth0a
		fi

		steady_pre=$(link_tx_packets_get "$netns" "$steady_dev")
		other_pre=$(link_tx_packets_get "$netns" "$other_dev")

		# Bump the fib6_node sernum to invalidate the cached dst.
		# A high-metric dummy route that is added and removed does
		# this without touching the ECMP nexthops, so there is no
		# transient single-nexthop state as there would be with a
		# multipath route replace.
		ip -n "$netns" -6 route add fd00:ff::2/128 dev lo metric 9999
		ip -n "$netns" -6 route del fd00:ff::2/128 dev lo metric 9999

		# Phase 2: enough post-rebuild traffic to see a path change.
		if ! busywait "$BUSYWAIT_TIMEOUT" until_counter_is \
				">= $((steady_pre + other_pre + 50))" \
			link_tx_packets_total "$netns" > /dev/null; then
			verdict=2
		else
			steady_post=$(link_tx_packets_get "$netns" "$steady_dev")
			other_post=$(link_tx_packets_get "$netns" "$other_dev")

			# The path changed if the formerly idle device
			# carried more of the post-rebuild traffic.
			if [ $((other_post - other_pre)) -gt \
			     $((steady_post - steady_pre)) ]; then
				verdict=1
			fi
		fi
	fi

	# Single exit path: restore the sysctl and reap the client.
	ip netns exec "$netns" sysctl -qw \
		net.ipv4.tcp_retries1="$retries1_orig"
	kill "$sender" 2>/dev/null
	wait "$sender" 2>/dev/null
	return "$verdict"
}
916
# Repeat ecmp_dst_rebuild_check ECMP_REBUILD_ROUNDS times and log a
# single result under the given label.
#
# $1: first port; round N listens on port $1 + N - 1
# $2: test name handed to log_test
#
# Every round starts a fresh socat listener in $NS2 and checks a fresh
# connection from $NS1.  A correct kernel selects the path
# deterministically (the same txhash always maps to the same ECMP
# nexthop), so a single path change already fails the test.  The
# repetition targets a buggy kernel that picks a path at random: it
# matches by accident 50% of the time per round, so 10 rounds leave
# < 0.1% chance of a false pass.
ecmp_dst_rebuild_loop()
{
	local base_port=$1; shift
	local label=$1; shift
	local flips=0
	local round port listener_pid verdict

	for round in $(seq 1 "$ECMP_REBUILD_ROUNDS"); do
		port=$((base_port + round - 1))

		ip netns exec "$NS2" socat -u \
			"TCP6-LISTEN:$port,bind=[fd00:ff::2],reuseaddr" \
			- >/dev/null &
		listener_pid=$!

		wait_local_port_listen "$NS2" "$port" tcp

		# 0 = path kept, 1 = path changed, 2 = round could not run.
		verdict=0
		ecmp_dst_rebuild_check "$NS1" "$port" || verdict=$?

		kill "$listener_pid" 2>/dev/null
		wait "$listener_pid" 2>/dev/null

		# Let both ends finish with this round's connection before
		# the verdict is acted upon and the next round starts.
		busywait "$BUSYWAIT_TIMEOUT" \
			port_has_no_active_tcp "$NS1" "$port" > /dev/null
		busywait "$BUSYWAIT_TIMEOUT" \
			port_has_no_active_tcp "$NS2" "$port" > /dev/null

		case "$verdict" in
		2)
			check_err 1 "no TX activity in round $round"
			break
			;;
		1)
			flips=$((flips + 1))
			;;
		esac
	done

	if ((flips > 0)); then
		check_err 1 "$flips/$ECMP_REBUILD_ROUNDS changed path"
	fi

	log_test "$label"
}
965
# Test case: invalidating the cached dst must not move an established
# connection to the other ECMP path.  With the fix, the initial route
# lookup (tcp_v6_connect) and every later rebuild
# (inet6_csk_route_socket) both use sk_txhash >> 1, so the selected
# path stays the same.
#
# Reserves ECMP_REBUILD_ROUNDS consecutive ports, one per round.
test_ecmp_dst_rebuild_consistency()
{
	local label="ECMP path stable after dst invalidation"
	local first_port

	RET=0
	alloc_ports first_port "$ECMP_REBUILD_ROUNDS"

	ecmp_dst_rebuild_loop "$first_port" "$label"
}
979
# Succeed (return 0) once namespace $1 has no TCP socket in an active
# state whose source or destination port is $2; return 1 otherwise.
# TIME_WAIT is deliberately not in the list of states because it does
# not generate outgoing traffic.
port_has_no_active_tcp()
{
	local ns=$1; shift
	local port=$1; shift
	local -a state_args=()
	local st socks

	for st in established fin-wait-1 fin-wait-2 close-wait \
		  last-ack closing syn-sent syn-recv; do
		state_args+=(state "$st")
	done

	# -H suppresses the header, so any output at all is a live socket.
	socks=$(ip netns exec "$ns" ss -tnH "${state_args[@]}" \
		"sport = :$port or dport = :$port")

	[ -z "$socks" ]
}
998
# Install a counting-only TCP filter on the egress of device $2 in
# namespace $1.  A prio root qdisc (handle 1:) hosts a u32 filter that
# matches byte 6 of the IPv6 header (Next Header) against 0x06 (TCP).
# The action is "ok", so matched packets are counted and passed on
# rather than dropped.
count_tcp()
{
	local ns=$1; shift
	local dev=$1; shift
	local -a tc_cmd=(ip netns exec "$ns" tc)

	"${tc_cmd[@]}" qdisc add dev "$dev" root handle 1: prio
	"${tc_cmd[@]}" filter add dev "$dev" parent 1: protocol ipv6 prio 1 \
		u32 match u8 0x06 0xff at 6 action ok
}
1010
# Verify that the server's SYN-ACK (sent from the request socket) and
# subsequent ACKs (sent from the full socket created in cookie_v6_check)
# use the same ECMP path.  With syncookies the request socket is freed
# after the SYN-ACK and a new one is created during cookie validation;
# this test catches the case where the two request sockets pick
# different ECMP paths due to independent txhash values.
#
# Method: force syncookies on in $NS2, count server-side TCP egress per
# device with tc filters, and run ECMP_REBUILD_ROUNDS connections, each
# on its own port.  A round is a "split" when both server devices
# carried TCP packets for that round's connection.  A round in which
# neither device counted any server TCP packet is reported as an error
# too: the server must at least have sent a SYN-ACK, so zero on both
# sides means the counters did not work and the round verified nothing.
#
# Skipped when net.ipv4.tcp_syncookies cannot be read in $NS2.
test_ecmp_syncookie_path_consistency()
{
	RET=0

	local saved_syncookies
	saved_syncookies=$(ip netns exec "$NS2" sysctl -n \
		net.ipv4.tcp_syncookies 2>/dev/null)
	if [ -z "$saved_syncookies" ]; then
		log_test_skip "Syncookie server ECMP path consistent"
		return "$ksft_skip"
	fi
	# 2 = answer every SYN with a syncookie, not only under SYN flood.
	ip netns exec "$NS2" sysctl -qw net.ipv4.tcp_syncookies=2
	defer ip netns exec "$NS2" sysctl -qw \
		net.ipv4.tcp_syncookies="$saved_syncookies"

	count_tcp "$NS2" veth0b
	defer unblock_tcp "$NS2" veth0b
	count_tcp "$NS2" veth1b
	defer unblock_tcp "$NS2" veth1b

	local path_splits=0
	local r base_port
	alloc_ports base_port "$ECMP_REBUILD_ROUNDS"

	for r in $(seq 1 "$ECMP_REBUILD_ROUNDS"); do
		local port=$((base_port + r - 1))

		ip netns exec "$NS2" socat -u \
			"TCP6-LISTEN:$port,bind=[fd00:ff::2],reuseaddr" \
			- >/dev/null &
		local server_pid=$!

		wait_local_port_listen "$NS2" "$port" tcp

		# Per-round baseline; the filters stay installed across rounds.
		local srv_base0 srv_base1
		srv_base0=$(tc_filter_pkt_count "$NS2" veth0b)
		srv_base1=$(tc_filter_pkt_count "$NS2" veth1b)

		ip netns exec "$NS1" timeout 5 socat -u \
			OPEN:/dev/zero \
			"TCP6:[fd00:ff::2]:$port,bind=[fd00:ff::1]" \
			&>/dev/null &
		local client_pid=$!

		# Wait until the client has pushed a meaningful amount of
		# traffic, so the server has sent its SYN-ACK and data ACKs.
		local cli_base
		cli_base=$(link_tx_packets_total "$NS1")
		if ! busywait "$BUSYWAIT_TIMEOUT" until_counter_is \
				">= $((cli_base + 200))" \
			link_tx_packets_total "$NS1" > /dev/null; then
			check_err 1 "no TX activity in round $r"
			kill "$client_pid" 2>/dev/null
			wait "$client_pid" 2>/dev/null
			kill "$server_pid" 2>/dev/null
			wait "$server_pid" 2>/dev/null
			break
		fi

		local srv_tcp0 srv_tcp1
		srv_tcp0=$(tc_filter_pkt_count "$NS2" veth0b)
		srv_tcp1=$(tc_filter_pkt_count "$NS2" veth1b)
		local srv_delta0=$(( ${srv_tcp0:-0} - ${srv_base0:-0} ))
		local srv_delta1=$(( ${srv_tcp1:-0} - ${srv_base1:-0} ))

		if [ "$srv_delta0" -gt 0 ] && [ "$srv_delta1" -gt 0 ]; then
			path_splits=$((path_splits + 1))
		elif [ "$srv_delta0" -le 0 ] && [ "$srv_delta1" -le 0 ]; then
			# Nothing was counted on either device, so this round
			# cannot vouch for path consistency; do not let it pass.
			check_err 1 "no server TCP packets counted in round $r"
		fi

		kill "$client_pid" 2>/dev/null
		wait "$client_pid" 2>/dev/null
		kill "$server_pid" 2>/dev/null
		wait "$server_pid" 2>/dev/null

		# Wait for TCP teardown packets (FIN/RST) to finish so
		# they do not pollute the next round's tc filter counters.
		busywait "$BUSYWAIT_TIMEOUT" \
			port_has_no_active_tcp "$NS1" "$port" > /dev/null
		busywait "$BUSYWAIT_TIMEOUT" \
			port_has_no_active_tcp "$NS2" "$port" > /dev/null
	done

	if [ "$path_splits" -gt 0 ]; then
		check_err 1 "$path_splits/$ECMP_REBUILD_ROUNDS had split server path"
	fi

	log_test "Syncookie server ECMP path consistent"
}
1103
# The tests drive their TCP clients and servers with socat.
require_command socat

# Exit handler: run the pending deferred cleanups first, then remove
# the namespaces.
ecmp_rehash_on_exit()
{
	defer_scopes_cleanup
	cleanup_all_ns
}
trap ecmp_rehash_on_exit EXIT

# A bare "exit" reuses the status of the last command, i.e. of setup.
setup || exit
tests_run
exit "${EXIT_STATUS}"
1110