xref: /linux/tools/testing/selftests/net/tcp_ecmp_failover.sh (revision fcee7d82f27d6a8b1ddc5bbefda59b4e441e9bc0)
1*d1ae37dcSKuniyuki Iwashima#!/bin/bash
2*d1ae37dcSKuniyuki Iwashima# SPDX-License-Identifier: GPL-2.0
3*d1ae37dcSKuniyuki Iwashima#
4*d1ae37dcSKuniyuki Iwashima# Copyright 2026 Google LLC.
5*d1ae37dcSKuniyuki Iwashima#
6*d1ae37dcSKuniyuki Iwashima# This test verifies TCP flow failover between ECMP routes
7*d1ae37dcSKuniyuki Iwashima# upon carrier loss on the active device.
8*d1ae37dcSKuniyuki Iwashima#
9*d1ae37dcSKuniyuki Iwashima#   socat  ----------------------------->  socat
10*d1ae37dcSKuniyuki Iwashima#                        |
11*d1ae37dcSKuniyuki Iwashima#           .-- veth-c1 -|- veth-s1 --.
12*d1ae37dcSKuniyuki Iwashima#   dummy0 -|            |            |-- dummy0
13*d1ae37dcSKuniyuki Iwashima#           '-- veth-c2 -|- veth-s2 --'
14*d1ae37dcSKuniyuki Iwashima#                        |
15*d1ae37dcSKuniyuki Iwashima#
16*d1ae37dcSKuniyuki Iwashima
# Assigned ahead of sourcing forwarding/lib.sh.  Presumably the library
# reads these to skip its jq/mz requirements and its handling of
# pre-existing test interfaces (this test builds its own veth pairs) --
# confirm against lib.sh.
REQUIRE_JQ="no"
REQUIRE_MZ="no"
NUM_NETIFS="0"

source forwarding/lib.sh

# Endpoint addresses.  Each one is put on dummy0 in its namespace and is
# reached from the other side through a two-nexthop ECMP route.
CLIENT_IP=10.0.59.1
SERVER_IP=10.0.92.1
CLIENT_IP6=2001:db8:5a9a::1
SERVER_IP6=2001:db8:9292::1
27*d1ae37dcSKuniyuki Iwashima
# Configure the server namespace.
#
# Adds dummy0 carrying the server's service addresses, brings up the
# server ends of both veth pairs, numbers them, and installs IPv4/IPv6
# ECMP routes back to the client addresses with one equal-weight nexthop
# per veth.
#
# Globals (read): server, SERVER_IP, SERVER_IP6, CLIENT_IP, CLIENT_IP6
setup_server()
{
	# Local array: the namespace name stays a single word and the helper
	# does not leak into (or get clobbered through) the global scope.
	local -a ip_cmd=(ip -n "$server")

	"${ip_cmd[@]}" link add dummy0 type dummy
	"${ip_cmd[@]}" link set dummy0 up

	"${ip_cmd[@]}" -4 addr add "$SERVER_IP/32" dev dummy0
	"${ip_cmd[@]}" -6 addr add "$SERVER_IP6/128" dev dummy0 nodad

	"${ip_cmd[@]}" link set veth-s1 up
	"${ip_cmd[@]}" link set veth-s2 up

	"${ip_cmd[@]}" -4 addr add 192.168.1.2/24 dev veth-s1
	"${ip_cmd[@]}" -4 addr add 192.168.2.2/24 dev veth-s2

	"${ip_cmd[@]}" -4 route add "$CLIENT_IP/32" \
		nexthop via 192.168.1.1 dev veth-s1 weight 1 \
		nexthop via 192.168.2.1 dev veth-s2 weight 1

	"${ip_cmd[@]}" -6 addr add 2001:db8:1::2/64 dev veth-s1 nodad
	"${ip_cmd[@]}" -6 addr add 2001:db8:2::2/64 dev veth-s2 nodad

	"${ip_cmd[@]}" -6 route add "$CLIENT_IP6/128" \
		nexthop via 2001:db8:1::1 dev veth-s1 weight 1 \
		nexthop via 2001:db8:2::1 dev veth-s2 weight 1
}
56*d1ae37dcSKuniyuki Iwashima
# Configure the client namespace.
#
# Mirrors the server side: dummy0 with the client's addresses, both
# veth-c* devices up and numbered, and IPv4/IPv6 ECMP routes to the
# server addresses.  It then tunes the sysctls the failover scenario
# depends on.
#
# Globals (read): client, CLIENT_IP, CLIENT_IP6, SERVER_IP, SERVER_IP6
setup_client()
{
	# Local arrays: the namespace name stays a single word and the
	# helpers do not leak into the global scope.
	local -a ip_cmd=(ip -n "$client")
	local -a ns_exec=(ip netns exec "$client")
	local family dev

	"${ip_cmd[@]}" link add dummy0 type dummy
	"${ip_cmd[@]}" link set dummy0 up

	"${ip_cmd[@]}" -4 addr add "$CLIENT_IP/32" dev dummy0
	"${ip_cmd[@]}" -6 addr add "$CLIENT_IP6/128" dev dummy0 nodad

	"${ip_cmd[@]}" link set veth-c1 up
	"${ip_cmd[@]}" link set veth-c2 up

	"${ip_cmd[@]}" -4 addr add 192.168.1.1/24 dev veth-c1
	"${ip_cmd[@]}" -4 addr add 192.168.2.1/24 dev veth-c2

	"${ip_cmd[@]}" -4 route add "$SERVER_IP/32" \
		nexthop via 192.168.1.2 dev veth-c1 weight 1 \
		nexthop via 192.168.2.2 dev veth-c2 weight 1

	"${ip_cmd[@]}" -6 addr add 2001:db8:1::1/64 dev veth-c1 nodad
	"${ip_cmd[@]}" -6 addr add 2001:db8:2::1/64 dev veth-c2 nodad

	"${ip_cmd[@]}" -6 route add "$SERVER_IP6/128" \
		nexthop via 2001:db8:1::2 dev veth-c1 weight 1 \
		nexthop via 2001:db8:2::2 dev veth-c2 weight 1

	# By default, tcp_retries1=3 triggers a route refresh
	# after 3 retransmits (~5s).  Ensure this never occurs
	# for test stability.
	"${ns_exec[@]}" sysctl -qw net.ipv4.tcp_retries1=100

	# When NETDEV_CHANGE is issued for a dev tied to an ECMP
	# route, RTNH_F_LINKDOWN is flagged and the sernum is
	# bumped to invalidate the route via sk_dst_check().
	#
	# Without ignore_routes_with_linkdown=1, subsequent
	# lookups may still select the same RTNH_F_LINKDOWN route.
	for family in ipv4 ipv6; do
		for dev in veth-c1 veth-c2; do
			"${ns_exec[@]}" sysctl -qw \
				"net.$family.conf.$dev.ignore_routes_with_linkdown=1"
		done
	done
}
102*d1ae37dcSKuniyuki Iwashima
# Build the whole topology for one test run: the two namespaces, the two
# veth pairs joining them, then the per-namespace configuration.
#
# setup_ns comes from the sourced library; it is presumably what fills in
# $client and $server with the namespace names used below.
setup()
{
	local path

	setup_ns client server

	# One veth pair per ECMP path: veth-cN stays in the client
	# namespace, its peer veth-sN is created in the server namespace.
	for path in 1 2; do
		ip -n "$client" link add "veth-c$path" type veth \
			peer "veth-s$path" netns "$server"
	done

	setup_server
	setup_client
}
113*d1ae37dcSKuniyuki Iwashima
# Tear down the test namespaces via the library's cleanup_all_ns helper.
# Both stdout and stderr are discarded; the helper's exit status is
# passed through unchanged.
cleanup()
{
	cleanup_all_ns &> /dev/null
}
118*d1ae37dcSKuniyuki Iwashima
# Run one ECMP failover scenario for a single address family.
#
# A server socat discards whatever it receives while a client socat
# streams /dev/zero to it.  Traffic is captured on both server-side veth
# devices to find the path the client picked, that device is taken down,
# and the flow must then show up on the other device.
#
# Arguments:
#   $1 - socat "pf" option value (IPv4 or IPv6)
#   $2 - server address to listen on and connect to (bracketed for IPv6)
#   $3 - client address to bind to (bracketed for IPv6)
# Globals:
#   client, server (read) - namespace names
#   ksft_fail (read)      - value stored in RET on failure
#   RET (written)         - 0 on success, $ksft_fail when fewer than
#                           1000 lines of capture output are seen on the
#                           surviving device
tcp_ecmp_failover()
{
	local pf=$1; shift
	local server_ip=$1; shift
	local client_ip=$1; shift

	local -r tcp_port=8080
	local -r min_failover_pkts=1000

	# Working state stays local so that back-to-back runs (IPv4 then
	# IPv6) cannot see each other's values.
	local server_pid client_pid
	local pkts_s1 pkts_s2 pkts
	local veth_down veth_up

	RET=0

	tcpdump_start veth-s1 "$server"
	tcpdump_start veth-s2 "$server"

	ip netns exec "$server" \
		socat -u "TCP-LISTEN:$tcp_port,pf=$pf,bind=$server_ip,reuseaddr" \
			/dev/null &
	server_pid=$!

	# Wait for server to start listening.
	# Sometimes client fails without this sleep.
	sleep 1

	ip netns exec "$client" \
		socat -u /dev/zero \
			"TCP:$server_ip:$tcp_port,pf=$pf,bind=$client_ip" &
	client_pid=$!

	# To capture enough packets.
	sleep 3

	tcpdump_stop veth-s1
	tcpdump_stop veth-s2

	pkts_s1=$(tcpdump_show veth-s1 | wc -l)
	pkts_s2=$(tcpdump_show veth-s2 | wc -l)

	tcpdump_cleanup veth-s1
	tcpdump_cleanup veth-s2

	# Detect the device chosen by the client: the busier capture wins.
	# A tie falls through to veth-s2.
	if [ "$pkts_s1" -gt "$pkts_s2" ]; then
		veth_down=veth-s1
		veth_up=veth-s2
	else
		veth_down=veth-s2
		veth_up=veth-s1
	fi

	# Taking down $veth_down causes its peer to lose carrier,
	# triggering NETDEV_CHANGE.  This flags RTNH_F_LINKDOWN
	# and bumps the sernum for the route associated with that
	# peer, invalidating the cached dst in the TCP socket.
	#
	# Consequently, sk_dst_check() fails, forcing the subsequent
	# lookup to select the remaining healthy route via $veth_up.
	ip -n "$server" link set "$veth_down" down

	tcpdump_start "$veth_up" "$server"

	# To capture enough packets.
	sleep 3

	tcpdump_stop "$veth_up"

	# The socat pair never finishes on its own (endless /dev/zero
	# stream), so it is killed outright.  Only these two jobs are
	# reaped; stderr is silenced to hide the shell's "Killed" notices.
	kill -9 "$client_pid" > /dev/null 2>&1
	kill -9 "$server_pid" > /dev/null 2>&1
	wait "$client_pid" "$server_pid" 2> /dev/null

	pkts=$(tcpdump_show "$veth_up" | wc -l)

	tcpdump_cleanup "$veth_up"

	if [ "$pkts" -lt "$min_failover_pkts" ]; then
		RET=$ksft_fail
	fi
}
191*d1ae37dcSKuniyuki Iwashima
# IPv4 test case: build the topology, run the failover scenario between
# the IPv4 endpoint addresses, log the result, and tear everything down.
#
# Globals (read): SERVER_IP, CLIENT_IP
test_ipv4()
{
	setup
	# Quoted like the IPv6 variant so that each address is always passed
	# as exactly one argument.
	tcp_ecmp_failover IPv4 "$SERVER_IP" "$CLIENT_IP"
	log_test "TCP IPv4 failover"
	cleanup
}
199*d1ae37dcSKuniyuki Iwashima
# IPv6 test case: build the topology, run the failover scenario between
# the IPv6 endpoint addresses, log the result, and tear everything down.
#
# Globals (read): SERVER_IP6, CLIENT_IP6
test_ipv6()
{
	# The addresses are handed over in bracketed form, which is how
	# they end up inside the socat address strings.
	local srv_addr="[$SERVER_IP6]"
	local cli_addr="[$CLIENT_IP6]"

	setup
	tcp_ecmp_failover IPv6 "$srv_addr" "$cli_addr"
	log_test "TCP IPv6 failover"
	cleanup
}
207*d1ae37dcSKuniyuki Iwashima
# External tools the scenario depends on: socat generates the TCP flow,
# tcpdump observes which path it takes.  require_command comes from the
# sourced library (presumably it ends the run when a tool is missing).
for required_cmd in socat tcpdump; do
	require_command "$required_cmd"
done

# Each test case cleans up after itself; the trap additionally covers
# any other way the script may exit.
trap cleanup EXIT

for test_fn in test_ipv4 test_ipv6; do
	"$test_fn"
done

# EXIT_STATUS is not assigned anywhere in this file; it is presumably
# maintained by the sourced library as results are logged.
exit "$EXIT_STATUS"
217