xref: /linux/tools/testing/selftests/net/pmtu.sh (revision 95298d63c67673c654c08952672d016212b26054)
1#!/bin/sh
2# SPDX-License-Identifier: GPL-2.0
3#
4# Check that route PMTU values match expectations, and that initial device MTU
5# values are assigned correctly
6#
7# Tests currently implemented:
8#
9# - pmtu_ipv4
10#	Set up two namespaces, A and B, with two paths between them over routers
11#	R1 and R2 (also implemented with namespaces), with different MTUs:
12#
13#	  segment a_r1    segment b_r1		a_r1: 2000
14#	.--------------R1--------------.	b_r1: 1400
15#	A                               B	a_r2: 2000
16#	'--------------R2--------------'	b_r2: 1500
17#	  segment a_r2    segment b_r2
18#
19#	Check that PMTU exceptions with the correct PMTU are created. Then
20#	decrease and increase the MTU of the local link for one of the paths,
21#	A to R1, checking that route exception PMTU changes accordingly over
22#	this path. Also check that locked exceptions are created when an ICMP
23#	message advertising a PMTU smaller than net.ipv4.route.min_pmtu is
24#	received
25#
26# - pmtu_ipv6
27#	Same as pmtu_ipv4, except for locked PMTU tests, using IPv6
28#
29# - pmtu_ipv4_vxlan4_exception
30#	Set up the same network topology as pmtu_ipv4, create a VXLAN tunnel
31#	over IPv4 between A and B, routed via R1. On the link between R1 and B,
32#	set a MTU lower than the VXLAN MTU and the MTU on the link between A and
33#	R1. Send IPv4 packets, exceeding the MTU between R1 and B, over VXLAN
34#	from A to B and check that the PMTU exception is created with the right
35#	value on A
36#
37# - pmtu_ipv6_vxlan4_exception
38#	Same as pmtu_ipv4_vxlan4_exception, but send IPv6 packets from A to B
39#
40# - pmtu_ipv4_vxlan6_exception
41#	Same as pmtu_ipv4_vxlan4_exception, but use IPv6 transport from A to B
42#
43# - pmtu_ipv6_vxlan6_exception
44#	Same as pmtu_ipv4_vxlan6_exception, but send IPv6 packets from A to B
45#
46# - pmtu_ipv4_geneve4_exception
47#	Same as pmtu_ipv4_vxlan4_exception, but using a GENEVE tunnel instead of
48#	VXLAN
49#
50# - pmtu_ipv6_geneve4_exception
51#	Same as pmtu_ipv6_vxlan4_exception, but using a GENEVE tunnel instead of
52#	VXLAN
53#
54# - pmtu_ipv4_geneve6_exception
55#	Same as pmtu_ipv4_vxlan6_exception, but using a GENEVE tunnel instead of
56#	VXLAN
57#
58# - pmtu_ipv6_geneve6_exception
59#	Same as pmtu_ipv6_vxlan6_exception, but using a GENEVE tunnel instead of
60#	VXLAN
61#
62# - pmtu_ipv{4,6}_fou{4,6}_exception
63#	Same as pmtu_ipv4_vxlan4, but using a direct IPv4/IPv6 encapsulation
64#	(FoU) over IPv4/IPv6, instead of VXLAN
65#
# - pmtu_ipv{4,6}_gue{4,6}_exception
67#	Same as pmtu_ipv4_vxlan4, but using a generic UDP IPv4/IPv6
68#	encapsulation (GUE) over IPv4/IPv6, instead of VXLAN
69#
70# - pmtu_ipv{4,6}_ipv{4,6}_exception
71#	Same as pmtu_ipv4_vxlan4, but using a IPv4/IPv6 tunnel over IPv4/IPv6,
72#	instead of VXLAN
73#
74# - pmtu_vti4_exception
75#	Set up vti tunnel on top of veth, with xfrm states and policies, in two
76#	namespaces with matching endpoints. Check that route exception is not
77#	created if link layer MTU is not exceeded, then exceed it and check that
78#	exception is created with the expected PMTU. The approach described
79#	below for IPv6 doesn't apply here, because, on IPv4, administrative MTU
80#	changes alone won't affect PMTU
81#
82# - pmtu_vti6_exception
83#	Set up vti6 tunnel on top of veth, with xfrm states and policies, in two
84#	namespaces with matching endpoints. Check that route exception is
85#	created by exceeding link layer MTU with ping to other endpoint. Then
86#	decrease and increase MTU of tunnel, checking that route exception PMTU
87#	changes accordingly
88#
89# - pmtu_vti4_default_mtu
90#	Set up vti4 tunnel on top of veth, in two namespaces with matching
91#	endpoints. Check that MTU assigned to vti interface is the MTU of the
92#	lower layer (veth) minus additional lower layer headers (zero, for veth)
93#	minus IPv4 header length
94#
95# - pmtu_vti6_default_mtu
96#	Same as above, for IPv6
97#
98# - pmtu_vti4_link_add_mtu
99#	Set up vti4 interface passing MTU value at link creation, check MTU is
100#	configured, and that link is not created with invalid MTU values
101#
102# - pmtu_vti6_link_add_mtu
103#	Same as above, for IPv6
104#
105# - pmtu_vti6_link_change_mtu
106#	Set up two dummy interfaces with different MTUs, create a vti6 tunnel
107#	and check that configured MTU is used on link creation and changes, and
108#	that MTU is properly calculated instead when MTU is not configured from
109#	userspace
110#
111# - cleanup_ipv4_exception
112#	Similar to pmtu_ipv4_vxlan4_exception, but explicitly generate PMTU
113#	exceptions on multiple CPUs and check that the veth device tear-down
114# 	happens in a timely manner
115#
116# - cleanup_ipv6_exception
117#	Same as above, but use IPv6 transport from A to B
118#
119# - list_flush_ipv4_exception
120#	Using the same topology as in pmtu_ipv4, create exceptions, and check
121#	they are shown when listing exception caches, gone after flushing them
122#
123# - list_flush_ipv6_exception
124#	Using the same topology as in pmtu_ipv6, create exceptions, and check
125#	they are shown when listing exception caches, gone after flushing them
126
127
# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4

# Defaults for the run-time switches read by the helpers in this file
PAUSE_ON_FAIL=no
VERBOSE=0
TRACING=0

# Some systems don't have a ping6 binary anymore: fall back to plain ping.
# 'command -v' is the POSIX way to look a command up; 'which' is a separate
# tool that is not guaranteed to be installed.
if ! ping6="$(command -v ping6 2> /dev/null)"; then
	ping6="$(command -v ping 2> /dev/null)"
fi
137
138#               Name                          Description                  re-run with nh
139tests="
140	pmtu_ipv4_exception		ipv4: PMTU exceptions			1
141	pmtu_ipv6_exception		ipv6: PMTU exceptions			1
142	pmtu_ipv4_vxlan4_exception	IPv4 over vxlan4: PMTU exceptions	1
143	pmtu_ipv6_vxlan4_exception	IPv6 over vxlan4: PMTU exceptions	1
144	pmtu_ipv4_vxlan6_exception	IPv4 over vxlan6: PMTU exceptions	1
145	pmtu_ipv6_vxlan6_exception	IPv6 over vxlan6: PMTU exceptions	1
146	pmtu_ipv4_geneve4_exception	IPv4 over geneve4: PMTU exceptions	1
147	pmtu_ipv6_geneve4_exception	IPv6 over geneve4: PMTU exceptions	1
148	pmtu_ipv4_geneve6_exception	IPv4 over geneve6: PMTU exceptions	1
149	pmtu_ipv6_geneve6_exception	IPv6 over geneve6: PMTU exceptions	1
150	pmtu_ipv4_fou4_exception	IPv4 over fou4: PMTU exceptions		1
151	pmtu_ipv6_fou4_exception	IPv6 over fou4: PMTU exceptions		1
152	pmtu_ipv4_fou6_exception	IPv4 over fou6: PMTU exceptions		1
153	pmtu_ipv6_fou6_exception	IPv6 over fou6: PMTU exceptions		1
154	pmtu_ipv4_gue4_exception	IPv4 over gue4: PMTU exceptions		1
155	pmtu_ipv6_gue4_exception	IPv6 over gue4: PMTU exceptions		1
156	pmtu_ipv4_gue6_exception	IPv4 over gue6: PMTU exceptions		1
157	pmtu_ipv6_gue6_exception	IPv6 over gue6: PMTU exceptions		1
158	pmtu_ipv4_ipv4_exception	IPv4 over IPv4: PMTU exceptions		1
159	pmtu_ipv6_ipv4_exception	IPv6 over IPv4: PMTU exceptions		1
160	pmtu_ipv4_ipv6_exception	IPv4 over IPv6: PMTU exceptions		1
161	pmtu_ipv6_ipv6_exception	IPv6 over IPv6: PMTU exceptions		1
162	pmtu_vti6_exception		vti6: PMTU exceptions			0
163	pmtu_vti4_exception		vti4: PMTU exceptions			0
164	pmtu_vti4_default_mtu		vti4: default MTU assignment		0
165	pmtu_vti6_default_mtu		vti6: default MTU assignment		0
166	pmtu_vti4_link_add_mtu		vti4: MTU setting on link creation	0
167	pmtu_vti6_link_add_mtu		vti6: MTU setting on link creation	0
168	pmtu_vti6_link_change_mtu	vti6: MTU changes on link changes	0
169	cleanup_ipv4_exception		ipv4: cleanup of cached exceptions	1
170	cleanup_ipv6_exception		ipv6: cleanup of cached exceptions	1
171	list_flush_ipv4_exception	ipv4: list and flush cached exceptions	1
172	list_flush_ipv6_exception	ipv6: list and flush cached exceptions	1"
173
174NS_A="ns-A"
175NS_B="ns-B"
176NS_R1="ns-R1"
177NS_R2="ns-R2"
178ns_a="ip netns exec ${NS_A}"
179ns_b="ip netns exec ${NS_B}"
180ns_r1="ip netns exec ${NS_R1}"
181ns_r2="ip netns exec ${NS_R2}"
182
# Addressing and routing for tests with routers: four network segments, with
# index SEGMENT between 1 and 4, a common prefix (PREFIX4 or PREFIX6) and an
# identifier ID, which is 1 for hosts (A and B), 2 for routers (R1 and R2).
# Addresses are:
# - IPv4: PREFIX4.SEGMENT.ID (/24)
# - IPv6: PREFIX6:SEGMENT::ID (/64)
prefix4="10.0"
prefix6="fc00"
# Segment indexes: <host>_<router>
a_r1=1
a_r2=2
b_r1=3
b_r2=4
# One veth pair is created per row by setup_routing(); rows are read word by
# word, three words per row.
#	ns	peer	segment
routing_addrs="
	A	R1	${a_r1}
	A	R2	${a_r2}
	B	R1	${b_r1}
	B	R2	${b_r2}
"
# Traffic from A to B goes through R1 by default, and through R2, if destined to
# B's address on the b_r2 segment.
# Traffic from B to A goes through R1.
# Consumed by setup_routing_old(), three words per row (IPv4 rows first, then
# IPv6).
#	ns	destination		gateway
routes="
	A	default			${prefix4}.${a_r1}.2
	A	${prefix4}.${b_r2}.1	${prefix4}.${a_r2}.2
	B	default			${prefix4}.${b_r1}.2

	A	default			${prefix6}:${a_r1}::2
	A	${prefix6}:${b_r2}::1	${prefix6}:${a_r2}::2
	B	default			${prefix6}:${b_r1}::2
"
215
216USE_NH="no"
217#	ns	family	nh id	   destination		gateway
218nexthops="
219	A	4	41	${prefix4}.${a_r1}.2	veth_A-R1
220	A	4	42	${prefix4}.${a_r2}.2	veth_A-R2
221	B	4	41	${prefix4}.${b_r1}.2	veth_B-R1
222
223	A	6	61	${prefix6}:${a_r1}::2	veth_A-R1
224	A	6	62	${prefix6}:${a_r2}::2	veth_A-R2
225	B	6	61	${prefix6}:${b_r1}::2	veth_B-R1
226"
227
228# nexthop id correlates to id in nexthops config above
229#	ns    family	prefix			nh id
230routes_nh="
231	A	4	default			41
232	A	4	${prefix4}.${b_r2}.1	42
233	B	4	default			41
234
235	A	6	default			61
236	A	6	${prefix6}:${b_r2}::1	62
237	B	6	default			61
238"
239
240veth4_a_addr="192.168.1.1"
241veth4_b_addr="192.168.1.2"
242veth4_mask="24"
243veth6_a_addr="fd00:1::a"
244veth6_b_addr="fd00:1::b"
245veth6_mask="64"
246
247tunnel4_a_addr="192.168.2.1"
248tunnel4_b_addr="192.168.2.2"
249tunnel4_mask="24"
250tunnel6_a_addr="fd00:2::a"
251tunnel6_b_addr="fd00:2::b"
252tunnel6_mask="64"
253
254dummy6_0_prefix="fc00:1000::"
255dummy6_1_prefix="fc00:1001::"
256dummy6_mask="64"
257
258err_buf=
259tcpdump_pids=
260
# Queue one message, terminated by a newline, at the end of err_buf. Nothing is
# printed here: see err_flush().
#	$1	message text
err() {
	err_buf="${err_buf}${1}"'
'
}
265
# Print every message queued by err(), exactly as stored, then empty the queue.
err_flush() {
	# printf '%s' rather than 'echo -n': POSIX leaves 'echo -n' and
	# backslash handling unspecified, and echo parses option-like text
	printf '%s' "${err_buf}"
	err_buf=
}
270
# Run a command, capturing its output (stdout and stderr together).
#
# All arguments are joined into a single string and split again on execution,
# so callers can pass pre-built option strings. If VERBOSE is 1, the command
# and any output it produced are printed.
#
# Sets the globals cmd, out and rc. Returns the exit status of the command.
run_cmd() {
	cmd="$*"

	if [ "$VERBOSE" = "1" ]; then
		# The command text is data, not a format: a '%' or a backslash
		# in it must be printed literally
		printf '    COMMAND: %s\n' "$cmd"
	fi

	out="$($cmd 2>&1)"
	rc=$?
	if [ "$VERBOSE" = "1" ] && [ -n "$out" ]; then
		printf '    %s\n\n' "$out"
	fi

	return $rc
}
287
# Print the namespace name for a short identifier, that is, the value of the
# variable NS_<identifier> (e.g. "R1" gives the value of NS_R1)
nsname() {
	eval "echo \${NS_${1}}"
}
292
# Set up a FoU or GUE tunnel between A and B, with endpoints on the a_r1 and
# b_r1 segments, and assign the inner tunnel addresses.
#	$1	outer (transport) IP version: 4 or 6
#	$2	inner (payload) IP version: 4 or 6
#	$3	encapsulation: fou or gue
# Returns 2 if the fou/fou6 module can't be loaded or if the receive port or
# the tunnel device can't be created on A; otherwise, the status of the last
# command.
setup_fou_or_gue() {
	outer="${1}"
	inner="${2}"
	encap="${3}"

	# Only the ip6tnl variants set a mode. This is a global, also written
	# by other setup helpers: without this, a leftover value would end up
	# on the command line of the IPv4 (ipip/sit) variants
	mode=""

	if [ "${outer}" = "4" ]; then
		modprobe fou || return 2
		a_addr="${prefix4}.${a_r1}.1"
		b_addr="${prefix4}.${b_r1}.1"
		if [ "${inner}" = "4" ]; then
			type="ipip"
			ipproto="4"
		else
			type="sit"
			ipproto="41"
		fi
	else
		modprobe fou6 || return 2
		a_addr="${prefix6}:${a_r1}::1"
		b_addr="${prefix6}:${b_r1}::1"
		if [ "${inner}" = "4" ]; then
			type="ip6tnl"
			mode="mode ipip6"
			ipproto="4 -6"
		else
			type="ip6tnl"
			mode="mode ip6ip6"
			ipproto="41 -6"
		fi
	fi

	# A listens on 5555 and sends to 5556, B does the opposite
	run_cmd ${ns_a} ip fou add port 5555 ipproto ${ipproto} || return 2
	run_cmd ${ns_a} ip link add ${encap}_a type ${type} ${mode} local ${a_addr} remote ${b_addr} encap ${encap} encap-sport auto encap-dport 5556 || return 2

	run_cmd ${ns_b} ip fou add port 5556 ipproto ${ipproto}
	run_cmd ${ns_b} ip link add ${encap}_b type ${type} ${mode} local ${b_addr} remote ${a_addr} encap ${encap} encap-sport auto encap-dport 5555

	if [ "${inner}" = "4" ]; then
		run_cmd ${ns_a} ip addr add ${tunnel4_a_addr}/${tunnel4_mask} dev ${encap}_a
		run_cmd ${ns_b} ip addr add ${tunnel4_b_addr}/${tunnel4_mask} dev ${encap}_b
	else
		run_cmd ${ns_a} ip addr add ${tunnel6_a_addr}/${tunnel6_mask} dev ${encap}_a
		run_cmd ${ns_b} ip addr add ${tunnel6_b_addr}/${tunnel6_mask} dev ${encap}_b
	fi

	run_cmd ${ns_a} ip link set ${encap}_a up
	run_cmd ${ns_b} ip link set ${encap}_b up
}
341
# FoU tunnel: IPv4 transport, IPv4 payload
setup_fou44() {
	setup_fou_or_gue "4" "4" "fou"
}
345
# FoU tunnel: IPv4 transport, IPv6 payload
setup_fou46() {
	setup_fou_or_gue "4" "6" "fou"
}
349
# FoU tunnel: IPv6 transport, IPv4 payload
setup_fou64() {
	setup_fou_or_gue "6" "4" "fou"
}
353
# FoU tunnel: IPv6 transport, IPv6 payload
setup_fou66() {
	setup_fou_or_gue "6" "6" "fou"
}
357
# GUE tunnel: IPv4 transport, IPv4 payload
setup_gue44() {
	setup_fou_or_gue "4" "4" "gue"
}
361
# GUE tunnel: IPv4 transport, IPv6 payload
setup_gue46() {
	setup_fou_or_gue "4" "6" "gue"
}
365
# GUE tunnel: IPv6 transport, IPv4 payload
setup_gue64() {
	setup_fou_or_gue "6" "4" "gue"
}
369
# GUE tunnel: IPv6 transport, IPv6 payload
setup_gue66() {
	setup_fou_or_gue "6" "6" "gue"
}
373
# Set up a plain IP-in-IP tunnel (devices ip_a and ip_b) between A and B, with
# endpoints on the a_r1 and b_r1 segments.
#	$1	inner (payload) IP version: 4 or 6
#	$2	outer (transport) IP version: 4 or 6
# Returns 2 if the tunnel device can't be created on A; otherwise, the status
# of the last address assignment.
setup_ipvX_over_ipvY() {
	inner=${1}
	outer=${2}

	# Tunnel type and mode depend on both versions, endpoints on the outer
	# one only
	case "${outer}" in
	4)
		a_addr="${prefix4}.${a_r1}.1"
		b_addr="${prefix4}.${b_r1}.1"
		case "${inner}" in
		4)	type="ipip";	mode="ipip"	;;
		*)	type="sit";	mode="ip6ip"	;;
		esac
		;;
	*)
		a_addr="${prefix6}:${a_r1}::1"
		b_addr="${prefix6}:${b_r1}::1"
		type="ip6tnl"
		case "${inner}" in
		4)	mode="ipip6"	;;
		*)	mode="ip6ip6"	;;
		esac
		;;
	esac

	if ! run_cmd ${ns_a} ip link add ip_a type ${type} local ${a_addr} remote ${b_addr} mode ${mode}; then
		return 2
	fi
	run_cmd ${ns_b} ip link add ip_b type ${type} local ${b_addr} remote ${a_addr} mode ${mode}

	run_cmd ${ns_a} ip link set ip_a up
	run_cmd ${ns_b} ip link set ip_b up

	# Inner addresses, from the family carried inside the tunnel
	if [ "${inner}" = "4" ]; then
		tun_a="${tunnel4_a_addr}/${tunnel4_mask}"
		tun_b="${tunnel4_b_addr}/${tunnel4_mask}"
	else
		tun_a="${tunnel6_a_addr}/${tunnel6_mask}"
		tun_b="${tunnel6_b_addr}/${tunnel6_mask}"
	fi
	run_cmd ${ns_a} ip addr add ${tun_a} dev ip_a
	run_cmd ${ns_b} ip addr add ${tun_b} dev ip_b
}
413
# IP tunnel: IPv4 payload over IPv4 transport
setup_ip4ip4() {
	setup_ipvX_over_ipvY "4" "4"
}
417
# IP tunnel: IPv6 payload over IPv4 transport
setup_ip6ip4() {
	setup_ipvX_over_ipvY "6" "4"
}
421
# IP tunnel: IPv4 payload over IPv6 transport
setup_ip4ip6() {
	setup_ipvX_over_ipvY "4" "6"
}
425
# IP tunnel: IPv6 payload over IPv6 transport
setup_ip6ip6() {
	setup_ipvX_over_ipvY "6" "6"
}
429
# Create the four namespaces used by the tests (A, B, R1, R2). Returns 1 as
# soon as one of them can't be created.
setup_namespaces() {
	for netns in ${NS_A} ${NS_B} ${NS_R1} ${NS_R2}; do
		if ! ip netns add ${netns}; then
			return 1
		fi

		# With DAD disabled, configured IPv6 addresses can be used
		# right away, without waiting
		ip netns exec ${netns} sysctl -q net/ipv6/conf/default/accept_dad=0
	done
}
439
# Connect A and B directly with a veth pair (veth_a in A, veth_b in B), give
# both ends their IPv4 and IPv6 addresses and bring them up. Returns 1 if the
# pair can't be created.
setup_veth() {
	if ! run_cmd ${ns_a} ip link add veth_a type veth peer name veth_b; then
		return 1
	fi
	# The pair is created in A: move the peer end over to B
	run_cmd ${ns_a} ip link set veth_b netns ${NS_B}

	a_v4="${veth4_a_addr}/${veth4_mask}"
	b_v4="${veth4_b_addr}/${veth4_mask}"
	a_v6="${veth6_a_addr}/${veth6_mask}"
	b_v6="${veth6_b_addr}/${veth6_mask}"

	run_cmd ${ns_a} ip addr add ${a_v4} dev veth_a
	run_cmd ${ns_b} ip addr add ${b_v4} dev veth_b
	run_cmd ${ns_a} ip addr add ${a_v6} dev veth_a
	run_cmd ${ns_b} ip addr add ${b_v6} dev veth_b

	run_cmd ${ns_a} ip link set veth_a up
	run_cmd ${ns_b} ip link set veth_b up
}
453
# Create a vti (IPv4) or vti6 (IPv6) tunnel between A and B, with key 10.
#	$1	IP version: 6 selects vti6, anything else vti
#	$2, $3	tunnel endpoints (local/remote) for A and B
#	$4, $5	addresses for the vti interfaces on A and B
#	$6	prefix length for the vti addresses
# Returns 1 if the device can't be created on A.
setup_vti() {
	proto=${1}
	veth_a_addr="${2}"
	veth_b_addr="${3}"
	vti_a_addr="${4}"
	vti_b_addr="${5}"
	vti_mask=${6}

	if [ ${proto} -eq 6 ]; then
		vti_type="vti6"
	else
		vti_type="vti"
	fi

	vti_a="vti${proto}_a"
	vti_b="vti${proto}_b"

	if ! run_cmd ${ns_a} ip link add ${vti_a} type ${vti_type} local ${veth_a_addr} remote ${veth_b_addr} key 10; then
		return 1
	fi
	run_cmd ${ns_b} ip link add ${vti_b} type ${vti_type} local ${veth_b_addr} remote ${veth_a_addr} key 10

	run_cmd ${ns_a} ip addr add ${vti_a_addr}/${vti_mask} dev ${vti_a}
	run_cmd ${ns_b} ip addr add ${vti_b_addr}/${vti_mask} dev ${vti_b}

	run_cmd ${ns_a} ip link set ${vti_a} up
	run_cmd ${ns_b} ip link set ${vti_b} up
}
473
# vti tunnel between the IPv4 veth addresses, with IPv4 tunnel addresses
setup_vti4() {
	setup_vti 4 \
		${veth4_a_addr} ${veth4_b_addr} \
		${tunnel4_a_addr} ${tunnel4_b_addr} ${tunnel4_mask}
}
477
# vti6 tunnel between the IPv6 veth addresses, with IPv6 tunnel addresses
setup_vti6() {
	setup_vti 6 \
		${veth6_a_addr} ${veth6_b_addr} \
		${tunnel6_a_addr} ${tunnel6_b_addr} ${tunnel6_mask}
}
481
# Create a VXLAN or GENEVE tunnel (id 1) between A and B, assign both the IPv4
# and the IPv6 tunnel addresses to it, and bring it up.
#	$1	tunnel type: vxlan or geneve
#	$2, $3	tunnel endpoints on A and B
#	$4	extra options for link creation (optional)
# Returns 1 if the device can't be created on A.
setup_vxlan_or_geneve() {
	type="${1}"
	a_addr="${2}"
	b_addr="${3}"
	opts="${4}"

	case "${type}" in
	vxlan)
		# VXLAN gets explicit TTL, destination port and local address
		opts="${opts} ttl 64 dstport 4789"
		opts_a="local ${a_addr}"
		opts_b="local ${b_addr}"
		;;
	*)
		opts_a=""
		opts_b=""
		;;
	esac

	dev_a="${type}_a"
	dev_b="${type}_b"

	if ! run_cmd ${ns_a} ip link add ${dev_a} type ${type} id 1 ${opts_a} remote ${b_addr} ${opts}; then
		return 1
	fi
	run_cmd ${ns_b} ip link add ${dev_b} type ${type} id 1 ${opts_b} remote ${a_addr} ${opts}

	run_cmd ${ns_a} ip addr add ${tunnel4_a_addr}/${tunnel4_mask} dev ${dev_a}
	run_cmd ${ns_b} ip addr add ${tunnel4_b_addr}/${tunnel4_mask} dev ${dev_b}

	run_cmd ${ns_a} ip addr add ${tunnel6_a_addr}/${tunnel6_mask} dev ${dev_a}
	run_cmd ${ns_b} ip addr add ${tunnel6_b_addr}/${tunnel6_mask} dev ${dev_b}

	run_cmd ${ns_a} ip link set ${dev_a} up
	run_cmd ${ns_b} ip link set ${dev_b} up
}
509
# GENEVE over IPv4, between the addresses of A and B on the segments to R1
setup_geneve4() {
	setup_vxlan_or_geneve geneve \
		${prefix4}.${a_r1}.1 ${prefix4}.${b_r1}.1 \
		"df set"
}
513
# VXLAN over IPv4, between the addresses of A and B on the segments to R1
setup_vxlan4() {
	setup_vxlan_or_geneve vxlan \
		${prefix4}.${a_r1}.1 ${prefix4}.${b_r1}.1 \
		"df set"
}
517
# GENEVE over IPv6, between the addresses of A and B on the segments to R1
setup_geneve6() {
	setup_vxlan_or_geneve geneve \
		${prefix6}:${a_r1}::1 ${prefix6}:${b_r1}::1
}
521
# VXLAN over IPv6, between the addresses of A and B on the segments to R1
setup_vxlan6() {
	setup_vxlan_or_geneve vxlan \
		${prefix6}:${a_r1}::1 ${prefix6}:${b_r1}::1
}
525
# Install ESP tunnel-mode xfrm states and policies on A and B: one state per
# direction (SPI 0x1000 from A to B, 0x1001 from B to A) and, on each side, an
# outbound and an inbound policy, both with mark 10.
#	$1	IP version: 4 or 6
#	$2, $3	tunnel endpoint addresses of A and B
# Returns 1 if the first state can't be added on A.
setup_xfrm() {
	proto=${1}
	veth_a_addr="${2}"
	veth_b_addr="${3}"

	xfrm="ip -${proto} xfrm"
	a_to_b="src ${veth_a_addr} dst ${veth_b_addr}"
	b_to_a="src ${veth_b_addr} dst ${veth_a_addr}"
	# Same algorithm and key for both directions
	esp="proto esp aead rfc4106(gcm(aes)) 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 128 mode tunnel"
	tmpl="proto esp mode tunnel"

	if ! run_cmd ${ns_a} ${xfrm} state add ${a_to_b} spi 0x1000 ${esp}; then
		return 1
	fi
	run_cmd ${ns_a} ${xfrm} state add ${b_to_a} spi 0x1001 ${esp}
	run_cmd ${ns_a} ${xfrm} policy add dir out mark 10 tmpl ${a_to_b} ${tmpl}
	run_cmd ${ns_a} ${xfrm} policy add dir in mark 10 tmpl ${b_to_a} ${tmpl}

	run_cmd ${ns_b} ${xfrm} state add ${a_to_b} spi 0x1000 ${esp}
	run_cmd ${ns_b} ${xfrm} state add ${b_to_a} spi 0x1001 ${esp}
	run_cmd ${ns_b} ${xfrm} policy add dir out mark 10 tmpl ${b_to_a} ${tmpl}
	run_cmd ${ns_b} ${xfrm} policy add dir in mark 10 tmpl ${a_to_b} ${tmpl}
}
541
# xfrm states and policies between the IPv4 veth addresses of A and B
setup_xfrm4() {
	setup_xfrm 4 \
		${veth4_a_addr} ${veth4_b_addr}
}
545
# xfrm states and policies between the IPv6 veth addresses of A and B
setup_xfrm6() {
	setup_xfrm 6 \
		${veth6_a_addr} ${veth6_b_addr}
}
549
# Add the routes listed in ${routes}, as plain gateway routes.
#
# ${routes} is read one word at a time, three words per row: namespace
# identifier, destination, gateway. Failures of 'ip route add' are not checked.
setup_routing_old() {
	# The parser below recognises a new row by these being empty: don't
	# rely on the caller (or on an earlier helper) having left them so,
	# otherwise every field shifts by one
	ns=""; addr=""; gw=""

	for i in ${routes}; do
		[ "${ns}" = "" ]	&& ns="${i}"		&& continue
		[ "${addr}" = "" ]	&& addr="${i}"		&& continue
		[ "${gw}" = "" ]	&& gw="${i}"

		ns_name="$(nsname ${ns})"

		ip -n ${ns_name} route add ${addr} via ${gw}

		ns=""; addr=""; gw=""
	done
}
563
# Create the nexthop objects listed in ${nexthops}, then add the routes listed
# in ${routes_nh}, which refer to those objects by id.
#
# Both tables are read one word at a time: five words per row for nexthops
# (namespace identifier, family, nexthop id, gateway, device), four for routes
# (namespace identifier, family, prefix, nexthop id). Failures of the ip
# commands are not checked.
setup_routing_new() {
	# The parser below recognises a new row by these being empty. They are
	# globals, and at least "dev" is also written by mtu(): clear them
	# here, otherwise a leftover value is used in place of the table's
	ns=""; fam=""; nhid=""; gw=""; dev=""

	for i in ${nexthops}; do
		[ "${ns}" = "" ]	&& ns="${i}"		&& continue
		[ "${fam}" = "" ]	&& fam="${i}"		&& continue
		[ "${nhid}" = "" ]	&& nhid="${i}"		&& continue
		[ "${gw}" = "" ]	&& gw="${i}"		&& continue
		[ "${dev}" = "" ]	&& dev="${i}"

		ns_name="$(nsname ${ns})"

		ip -n ${ns_name} -${fam} nexthop add id ${nhid} via ${gw} dev ${dev}

		ns=""; fam=""; nhid=""; gw=""; dev=""

	done

	ns=""; fam=""; addr=""; nhid=""

	for i in ${routes_nh}; do
		[ "${ns}" = "" ]	&& ns="${i}"		&& continue
		[ "${fam}" = "" ]	&& fam="${i}"		&& continue
		[ "${addr}" = "" ]	&& addr="${i}"		&& continue
		[ "${nhid}" = "" ]	&& nhid="${i}"

		ns_name="$(nsname ${ns})"

		ip -n ${ns_name} -${fam} route add ${addr} nhid ${nhid}

		ns=""; fam=""; addr=""; nhid=""
	done
}
593
# Build the routed topology: enable forwarding on R1 and R2, create one veth
# pair per row of ${routing_addrs}, assign addresses, then add routes, using
# nexthop objects if USE_NH is "yes".
#
# ${routing_addrs} is read one word at a time, three words per row: namespace
# identifier, peer identifier, segment index. On each segment, the namespace
# end gets ID 1 and the peer end ID 2.
#
# Returns 1 if a veth pair can't be created, 0 otherwise.
setup_routing() {
	for i in ${NS_R1} ${NS_R2}; do
		ip netns exec ${i} sysctl -q net/ipv4/ip_forward=1
		ip netns exec ${i} sysctl -q net/ipv6/conf/all/forwarding=1
	done

	# The parser below recognises a new row by these being empty: don't
	# rely on whatever was left in these globals, otherwise every field
	# shifts by one
	ns=""; peer=""; segment=""

	for i in ${routing_addrs}; do
		[ "${ns}" = "" ]	&& ns="${i}"		&& continue
		[ "${peer}" = "" ]	&& peer="${i}"		&& continue
		[ "${segment}" = "" ]	&& segment="${i}"

		ns_name="$(nsname ${ns})"
		peer_name="$(nsname ${peer})"
		if="veth_${ns}-${peer}"
		ifpeer="veth_${peer}-${ns}"

		# Create veth links
		ip link add ${if} up netns ${ns_name} type veth peer name ${ifpeer} netns ${peer_name} || return 1
		ip -n ${peer_name} link set dev ${ifpeer} up

		# Add addresses
		ip -n ${ns_name}   addr add ${prefix4}.${segment}.1/24  dev ${if}
		ip -n ${ns_name}   addr add ${prefix6}:${segment}::1/64 dev ${if}

		ip -n ${peer_name} addr add ${prefix4}.${segment}.2/24  dev ${ifpeer}
		ip -n ${peer_name} addr add ${prefix6}:${segment}::2/64 dev ${ifpeer}

		ns=""; peer=""; segment=""
	done

	if [ "$USE_NH" = "yes" ]; then
		setup_routing_new
	else
		setup_routing_old
	fi

	return 0
}
632
# Run the given setup steps in order, after cleaning up leftovers from
# previous runs: each argument <step> invokes setup_<step>.
# Returns the SKIP code if not running as root, 1 if a step fails.
setup() {
	if [ "$(id -u)" -ne 0 ] && echo "  need to run as root"; then
		return $ksft_skip
	fi

	cleanup
	for step in "$@"; do
		if ! eval setup_${step}; then
			echo "  ${step} not supported"
			return 1
		fi
	done
}
641
# If TRACING is enabled, start one tcpdump per interface, in the background.
#
# Arguments come in pairs: a namespace command prefix (e.g. "${ns_a}") followed
# by an interface name. Each capture is written to <name>_<interface>.pcap,
# where ${name} is a global set outside this function (note that link_get() and
# link_get_mtu() also assign it). PIDs are appended to tcpdump_pids, so that
# cleanup() can stop the captures.
trace() {
	[ "${TRACING}" -eq 0 ] && return

	# An empty ns_cmd marks the start of a pair. It's a global also written
	# by mtu(), link_get() and the route helpers: reset it, otherwise a
	# leftover value makes every prefix be taken for an interface name
	ns_cmd=

	for arg do
		[ "${ns_cmd}" = "" ] && ns_cmd="${arg}" && continue
		${ns_cmd} tcpdump -s 0 -i "${arg}" -w "${name}_${arg}.pcap" 2> /dev/null &
		tcpdump_pids="${tcpdump_pids} $!"
		ns_cmd=
	done
	# Give the captures some time to start
	sleep 1
}
653
# Undo what setup() and trace() did: stop the captures recorded in
# tcpdump_pids and delete the four namespaces.
cleanup() {
	for capture_pid in ${tcpdump_pids}; do
		kill ${capture_pid}
	done
	tcpdump_pids=

	# Namespaces might not exist at this point: errors are discarded
	for netns in ${NS_A} ${NS_B} ${NS_R1} ${NS_R2}; do
		ip netns del ${netns} 2> /dev/null
	done
}
664
# Set the MTU of a device.
#	$1	namespace command prefix (e.g. "${ns_a}")
#	$2	device name
#	$3	MTU
# Leaves its arguments in the globals ns_cmd, dev and mtu.
mtu() {
	ns_cmd="${1}" dev="${2}" mtu="${3}"

	${ns_cmd} ip link set dev ${dev} mtu ${mtu}
}
672
# Extract the MTU from the output of 'ip link show' or 'ip route get'.
#	$1	text to parse
# Prints the word following the first "mtu" keyword, or "lock <value>" if that
# word is "lock". Prints nothing if there's no "mtu" keyword.
mtu_parse() {
	input="${1}"

	# 0: looking for "mtu", 1: "mtu" was the previous word,
	# 2: "mtu lock" were the previous words
	next=0
	for i in ${input}; do
		case "${next}" in
		1)
			if [ "${i}" = "lock" ]; then
				next=2
				continue
			fi
			echo "${i}"
			return
			;;
		2)
			echo "lock ${i}"
			return
			;;
		*)
			[ "${i}" = "mtu" ] && next=1
			;;
		esac
	done
}
684
# Print the 'ip link show' output for a device.
#	$1	namespace command prefix
#	$2	device name
# Leaves its arguments in the globals ns_cmd and name.
link_get() {
	ns_cmd="${1}" name="${2}"

	${ns_cmd} ip link show dev "${name}"
}
691
# Print the MTU of a device, as reported by 'ip link show'.
#	$1	namespace command prefix
#	$2	device name
# Leaves its arguments in the globals ns_cmd and name.
link_get_mtu() {
	ns_cmd="${1}" name="${2}"

	link_desc="$(link_get "${ns_cmd}" ${name})"
	mtu_parse "${link_desc}"
}
698
# Print the 'ip route get' output for a destination.
#	$1	namespace command prefix
#	$2	destination address
# Leaves its arguments in the globals ns_cmd and dst.
route_get_dst_exception() {
	ns_cmd="${1}" dst="${2}"

	${ns_cmd} ip route get "${dst}"
}
705
# Print the PMTU for a destination, as reported by 'ip route get': either a
# number, or "lock <number>". Prints nothing if the route carries no MTU.
#	$1	namespace command prefix
#	$2	destination address
# Leaves its arguments in the globals ns_cmd and dst.
route_get_dst_pmtu_from_exception() {
	ns_cmd="${1}" dst="${2}"

	route_desc="$(route_get_dst_exception "${ns_cmd}" ${dst})"
	mtu_parse "${route_desc}"
}
712
# Compare a PMTU value against the expected one, queueing an error message
# with err() on mismatch.
#	$1	expected value: a PMTU, "any" to accept whatever non-empty value,
#		or empty if no exception is expected
#	$2	value found
#	$3	description of the event preceding the check, for messages
# Returns 0 on match, 1 otherwise.
check_pmtu_value() {
	expected="${1}"
	value="${2}"
	event="${3}"

	if [ "${expected}" = "any" ] && [ -n "${value}" ]; then
		return 0
	elif [ "${value}" = "${expected}" ]; then
		return 0
	fi

	# Mismatch: pick the message that explains it best
	if [ -z "${value}" ]; then
		err "  PMTU exception wasn't created after ${event}"
	elif [ -z "${expected}" ]; then
		err "  PMTU exception shouldn't exist after ${event}"
	else
		err "  found PMTU exception with incorrect MTU ${value}, expected ${expected}, after ${event}"
	fi
	return 1
}
725
# PMTU exceptions on the routed topology, over both paths from A to B.
#	$1	IP version: 4 or 6 (locked PMTU checks are IPv4 only)
# Returns 2 if the setup fails, 1 if a check fails, 0 otherwise.
test_pmtu_ipvX() {
	family=${1}

	setup namespaces routing || return 2
	trace "${ns_a}"  veth_A-R1    "${ns_r1}" veth_R1-A \
	      "${ns_r1}" veth_R1-B    "${ns_b}"  veth_B-R1 \
	      "${ns_a}"  veth_A-R2    "${ns_r2}" veth_R2-A \
	      "${ns_r2}" veth_R2-B    "${ns_b}"  veth_B-R2

	# dst1 is reached via R1, dst2 via R2
	case "${family}" in
	4)
		ping=ping
		dst1="${prefix4}.${b_r1}.1"
		dst2="${prefix4}.${b_r2}.1"
		;;
	*)
		ping=${ping6}
		dst1="${prefix6}:${b_r1}::1"
		dst2="${prefix6}:${b_r2}::1"
		;;
	esac
	ping_opts="-q -M want -i 0.1 -w 1"

	# Initial MTUs: 2000 on A's side of both routers, 1400 between R1 and
	# B, 1500 between R2 and B
	mtu "${ns_a}"  veth_A-R1 2000
	mtu "${ns_r1}" veth_R1-A 2000
	mtu "${ns_r1}" veth_R1-B 1400
	mtu "${ns_b}"  veth_B-R1 1400

	mtu "${ns_a}"  veth_A-R2 2000
	mtu "${ns_r2}" veth_R2-A 2000
	mtu "${ns_r2}" veth_R2-B 1500
	mtu "${ns_b}"  veth_B-R2 1500

	# Packets fitting A's links, but not the ones towards B, create the
	# exceptions
	run_cmd ${ns_a} ${ping} ${ping_opts} -s 1800 ${dst1}
	run_cmd ${ns_a} ${ping} ${ping_opts} -s 1800 ${dst2}

	# Both exceptions must carry the MTU of the link towards B
	pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst1})"
	check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1
	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2})"
	check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1

	# A local MTU below the PMTU lowers the PMTU of the exception...
	mtu "${ns_a}"  veth_A-R1 1300
	mtu "${ns_r1}" veth_R1-A 1300
	pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst1})"
	check_pmtu_value "1300" "${pmtu_1}" "decreasing local MTU" || return 1
	# ...but only on the path using that link
	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2})"
	check_pmtu_value "1500" "${pmtu_2}" "changing local MTU on a link not on this path" || return 1

	# Raising the local MTU raises the PMTU of the exception...
	mtu "${ns_a}"  veth_A-R1 1700
	mtu "${ns_r1}" veth_R1-A 1700
	pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst1})"
	check_pmtu_value "1700" "${pmtu_1}" "increasing local MTU" || return 1
	# ...and, again, the other path is left alone
	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2})"
	check_pmtu_value "1500" "${pmtu_2}" "changing local MTU on a link not on this path" || return 1

	# The remaining checks are about locked PMTU: not for IPv6
	if [ $family -eq 6 ]; then
		return 0
	fi

	# Remote MTU on the path via R2 goes below min_pmtu: expect a new,
	# locked exception
	mtu "${ns_r2}" veth_R2-B 400
	mtu "${ns_b}"  veth_B-R2 400
	run_cmd ${ns_a} ${ping} ${ping_opts} -s 1400 ${dst2}
	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2})"
	check_pmtu_value "lock 552" "${pmtu_2}" "exceeding MTU, with MTU < min_pmtu" || return 1

	# Local MTU below that PMTU
	mtu "${ns_a}"  veth_A-R2 500
	mtu "${ns_r2}" veth_R2-A 500
	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2})"
	check_pmtu_value "500" "${pmtu_2}" "decreasing local MTU" || return 1

	# Local MTU back up
	mtu "${ns_a}"  veth_A-R2 1500
	mtu "${ns_r2}" veth_R2-A 1500
	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2})"
	check_pmtu_value "1500" "${pmtu_2}" "increasing local MTU" || return 1

	# Exceeding the remote MTU again must give a locked exception again
	run_cmd ${ns_a} ${ping} ${ping_opts} -s 1400 ${dst2}
	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2})"
	check_pmtu_value "lock 552" "${pmtu_2}" "exceeding MTU, with MTU < min_pmtu" || return 1
}
811
# PMTU exceptions on the routed topology, IPv4
test_pmtu_ipv4_exception() {
	test_pmtu_ipvX "4"
}
815
# PMTU exceptions on the routed topology, IPv6
test_pmtu_ipv6_exception() {
	test_pmtu_ipvX "6"
}
819
# PMTU exception for traffic sent over a VXLAN or GENEVE tunnel routed via R1,
# with the link between R1 and B smaller than the one between A and R1.
#	$1	tunnel type: vxlan or geneve
#	$2	IP version of the traffic sent over the tunnel: 4 or 6
#	$3	IP version of the tunnel transport: 4 or 6
# Returns 2 if the setup fails, otherwise the result of the PMTU check.
test_pmtu_ipvX_over_vxlanY_or_geneveY_exception() {
	type=${1}
	family=${2}
	outer_family=${3}
	ll_mtu=4000

	# Size of the outer IP header, and name of the tunnel setup step
	case "${outer_family}" in
	4)
		outer_hdr=20
		tunnel_step="${type}4"
		;;
	*)
		outer_hdr=40
		tunnel_step="${type}6"
		;;
	esac
	setup namespaces routing ${tunnel_step} || return 2
	#                    outer IP header   UDP header   VXLAN/GENEVE header   Ethernet header
	exp_mtu=$((ll_mtu - outer_hdr        - 8          - 8                   - 14))

	trace "${ns_a}" ${type}_a    "${ns_b}"  ${type}_b \
	      "${ns_a}" veth_A-R1    "${ns_r1}" veth_R1-A \
	      "${ns_b}" veth_B-R1    "${ns_r1}" veth_R1-B

	case "${family}" in
	4)
		ping=ping
		dst=${tunnel4_b_addr}
		;;
	*)
		ping=${ping6}
		dst=${tunnel6_b_addr}
		;;
	esac

	# Links between A and R1, and the tunnel itself, are larger than the
	# link between R1 and B, so that the latter is the one being exceeded
	raised_mtu=$((ll_mtu + 1000))
	mtu "${ns_a}"  veth_A-R1 ${raised_mtu}
	mtu "${ns_r1}" veth_R1-A ${raised_mtu}
	mtu "${ns_b}"  veth_B-R1 ${ll_mtu}
	mtu "${ns_r1}" veth_R1-B ${ll_mtu}

	mtu "${ns_a}" ${type}_a ${raised_mtu}
	mtu "${ns_b}" ${type}_b ${raised_mtu}
	run_cmd ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s $((ll_mtu + 500)) ${dst}

	# The exception must account for the encapsulation overhead
	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst})"
	check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on ${type} interface"
}
862
# IPv4 traffic over VXLAN with IPv4 transport
test_pmtu_ipv4_vxlan4_exception() {
	test_pmtu_ipvX_over_vxlanY_or_geneveY_exception "vxlan" "4" "4"
}
866
# IPv6 traffic over VXLAN with IPv4 transport
test_pmtu_ipv6_vxlan4_exception() {
	test_pmtu_ipvX_over_vxlanY_or_geneveY_exception "vxlan" "6" "4"
}
870
# IPv4 traffic over GENEVE with IPv4 transport
test_pmtu_ipv4_geneve4_exception() {
	test_pmtu_ipvX_over_vxlanY_or_geneveY_exception "geneve" "4" "4"
}
874
# IPv6 traffic over GENEVE with IPv4 transport
test_pmtu_ipv6_geneve4_exception() {
	test_pmtu_ipvX_over_vxlanY_or_geneveY_exception "geneve" "6" "4"
}
878
# IPv4 traffic over VXLAN with IPv6 transport
test_pmtu_ipv4_vxlan6_exception() {
	test_pmtu_ipvX_over_vxlanY_or_geneveY_exception "vxlan" "4" "6"
}
882
# IPv6 traffic over VXLAN with IPv6 transport
test_pmtu_ipv6_vxlan6_exception() {
	test_pmtu_ipvX_over_vxlanY_or_geneveY_exception "vxlan" "6" "6"
}
886
# IPv4 traffic over GENEVE with IPv6 transport
test_pmtu_ipv4_geneve6_exception() {
	test_pmtu_ipvX_over_vxlanY_or_geneveY_exception "geneve" "4" "6"
}
890
# IPv6 traffic over GENEVE with IPv6 transport
test_pmtu_ipv6_geneve6_exception() {
	test_pmtu_ipvX_over_vxlanY_or_geneveY_exception "geneve" "6" "6"
}
894
# PMTU exception for traffic sent over a FoU or GUE tunnel routed via R1, with
# the link between R1 and B smaller than the one between A and R1.
#	$1	IP version of the traffic sent over the tunnel: 4 or 6
#	$2	IP version of the tunnel transport: 4 or 6
#	$3	encapsulation: fou or gue
# Returns 2 if the setup fails, otherwise the result of the PMTU check.
test_pmtu_ipvX_over_fouY_or_gueY() {
	inner_family=${1}
	outer_family=${2}
	encap=${3}
	ll_mtu=4000

	setup namespaces routing ${encap}${outer_family}${inner_family} || return 2
	trace "${ns_a}" ${encap}_a   "${ns_b}"  ${encap}_b \
	      "${ns_a}" veth_A-R1    "${ns_r1}" veth_R1-A \
	      "${ns_b}" veth_B-R1    "${ns_r1}" veth_R1-B

	case "${inner_family}" in
	4)
		ping=ping
		dst=${tunnel4_b_addr}
		;;
	*)
		ping=${ping6}
		dst=${tunnel6_b_addr}
		;;
	esac

	# GUE adds 4 bytes on top of what FoU needs
	case "${encap}" in
	gue)	encap_overhead=4	;;
	*)	encap_overhead=0	;;
	esac

	case "${outer_family}" in
	4)
		#                    IPv4 header   UDP header
		exp_mtu=$((ll_mtu - 20          - 8         - encap_overhead))
		;;
	*)
		#                    IPv6 header   Option 4   UDP header
		exp_mtu=$((ll_mtu - 40          - 8        - 8       - encap_overhead))
		;;
	esac

	# Links between A and R1, and the tunnel itself, are larger than the
	# link between R1 and B, so that the latter is the one being exceeded
	raised_mtu=$((ll_mtu + 1000))
	mtu "${ns_a}"  veth_A-R1 ${raised_mtu}
	mtu "${ns_r1}" veth_R1-A ${raised_mtu}
	mtu "${ns_b}"  veth_B-R1 ${ll_mtu}
	mtu "${ns_r1}" veth_R1-B ${ll_mtu}

	mtu "${ns_a}" ${encap}_a ${raised_mtu}
	mtu "${ns_b}" ${encap}_b ${raised_mtu}
	run_cmd ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s $((ll_mtu + 500)) ${dst}

	# The exception must account for the encapsulation overhead
	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst})"
	check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on ${encap} interface"
}
942
# IPv4 traffic over FoU with IPv4 transport
test_pmtu_ipv4_fou4_exception() {
	test_pmtu_ipvX_over_fouY_or_gueY "4" "4" "fou"
}
946
# IPv6 traffic over FoU with IPv4 transport
test_pmtu_ipv6_fou4_exception() {
	test_pmtu_ipvX_over_fouY_or_gueY "6" "4" "fou"
}
950
# IPv4 traffic over FoU with IPv6 transport
test_pmtu_ipv4_fou6_exception() {
	test_pmtu_ipvX_over_fouY_or_gueY "4" "6" "fou"
}
954
# IPv6 traffic over FoU with IPv6 transport
test_pmtu_ipv6_fou6_exception() {
	test_pmtu_ipvX_over_fouY_or_gueY "6" "6" "fou"
}
958
# IPv4 traffic over GUE with IPv4 transport
test_pmtu_ipv4_gue4_exception() {
	test_pmtu_ipvX_over_fouY_or_gueY "4" "4" "gue"
}
962
# IPv6 traffic over GUE with IPv4 transport
test_pmtu_ipv6_gue4_exception() {
	test_pmtu_ipvX_over_fouY_or_gueY "6" "4" "gue"
}
966
# IPv4 traffic over GUE with IPv6 transport
test_pmtu_ipv4_gue6_exception() {
	test_pmtu_ipvX_over_fouY_or_gueY "4" "6" "gue"
}
970
# IPv6 traffic over GUE with IPv6 transport
test_pmtu_ipv6_gue6_exception() {
	test_pmtu_ipvX_over_fouY_or_gueY "6" "6" "gue"
}
974
# PMTU exception for traffic sent over a plain IP tunnel routed via R1, with
# the link between R1 and B smaller than the one between A and R1.
#	$1	IP version of the traffic sent over the tunnel: 4 or 6
#	$2	IP version of the tunnel transport: 4 or 6
# Returns 2 if the setup fails and, if the tunnel MTU can't be set, the status
# of that command; otherwise, the result of the PMTU check.
test_pmtu_ipvX_over_ipvY_exception() {
	inner=${1}
	outer=${2}
	ll_mtu=4000

	setup namespaces routing ip${inner}ip${outer} || return 2

	trace "${ns_a}" ip_a         "${ns_b}"  ip_b  \
	      "${ns_a}" veth_A-R1    "${ns_r1}" veth_R1-A \
	      "${ns_b}" veth_B-R1    "${ns_r1}" veth_R1-B

	case "${inner}" in
	4)
		ping=ping
		dst=${tunnel4_b_addr}
		;;
	*)
		ping=${ping6}
		dst=${tunnel6_b_addr}
		;;
	esac

	case "${outer}" in
	4)
		#                    IPv4 header
		exp_mtu=$((ll_mtu - 20))
		;;
	*)
		#                    IPv6 header   Option 4
		exp_mtu=$((ll_mtu - 40          - 8))
		;;
	esac

	# Links between A and R1, and the tunnel itself, are larger than the
	# link between R1 and B, so that the latter is the one being exceeded
	raised_mtu=$((ll_mtu + 1000))
	mtu "${ns_a}"  veth_A-R1 ${raised_mtu}
	mtu "${ns_r1}" veth_R1-A ${raised_mtu}
	mtu "${ns_b}"  veth_B-R1 ${ll_mtu}
	mtu "${ns_r1}" veth_R1-B ${ll_mtu}

	# Stop here, with the status of the failed command, if the tunnel MTU
	# can't be set
	mtu "${ns_a}" ip_a ${raised_mtu} || return
	mtu "${ns_b}" ip_b ${raised_mtu} || return
	run_cmd ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s $((ll_mtu + 500)) ${dst}

	# The exception must account for the encapsulation overhead
	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst})"
	check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on ip${inner}ip${outer} interface"
}
1016
# IP-in-IP PMTU exception test: IPv4 packets (first argument, inner) over
# IPv4 transport (second argument, outer)
test_pmtu_ipv4_ipv4_exception() {
	test_pmtu_ipvX_over_ipvY_exception 4 4
}
1020
# IP-in-IP PMTU exception test: IPv6 packets (first argument, inner) over
# IPv4 transport (second argument, outer)
test_pmtu_ipv6_ipv4_exception() {
	test_pmtu_ipvX_over_ipvY_exception 6 4
}
1024
# IP-in-IP PMTU exception test: IPv4 packets (first argument, inner) over
# IPv6 transport (second argument, outer)
test_pmtu_ipv4_ipv6_exception() {
	test_pmtu_ipvX_over_ipvY_exception 4 6
}
1028
# IP-in-IP PMTU exception test: IPv6 packets (first argument, inner) over
# IPv6 transport (second argument, outer)
test_pmtu_ipv6_ipv6_exception() {
	test_pmtu_ipvX_over_ipvY_exception 6 6
}
1032
# Check that, on a vti4 tunnel, no PMTU exception is created for a packet that
# still fits, and that one with the right value is created as soon as the
# packet is one byte too big.
#
# Returns 2 if the setup fails, 1 if the first check (no exception expected)
# fails, and the result of the second check_pmtu_value() otherwise
test_pmtu_vti4_exception() {
	setup namespaces veth vti4 xfrm4 || return 2
	trace "${ns_a}" veth_a "${ns_b}" veth_b \
	      "${ns_a}" vti4_a "${ns_b}" vti4_b

	veth_mtu=1500
	vti_mtu=$((veth_mtu - 20))

	# Taken off the tunnel MTU, in bytes: SPI (4), SN (4), IV (8), ICV (16),
	# pad length (1), next header (1)
	esp_payload_rfc4106=$((vti_mtu - 4 - 4 - 8 - 16 - 1 - 1))
	# NOTE(review): 28 is presumably IPv4 (20) plus ICMP (8) header length
	ping_payload=$((esp_payload_rfc4106 - 28))

	mtu "${ns_a}" veth_a ${veth_mtu}
	mtu "${ns_b}" veth_b ${veth_mtu}
	mtu "${ns_a}" vti4_a ${vti_mtu}
	mtu "${ns_b}" vti4_b ${vti_mtu}

	# Send DF packet without exceeding link layer MTU, check that no
	# exception is created
	run_cmd ${ns_a} ping -q -M want -i 0.1 -w 1 -s ${ping_payload} ${tunnel4_b_addr}
	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel4_b_addr})"
	if ! check_pmtu_value "" "${pmtu}" "sending packet smaller than PMTU (IP payload length ${esp_payload_rfc4106})"; then
		return 1
	fi

	# Now exceed link layer MTU by one byte, check that exception is created
	# with the right PMTU value
	run_cmd ${ns_a} ping -q -M want -i 0.1 -w 1 -s $((ping_payload + 1)) ${tunnel4_b_addr}
	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel4_b_addr})"
	check_pmtu_value "${esp_payload_rfc4106}" "${pmtu}" "exceeding PMTU (IP payload length $((esp_payload_rfc4106 + 1)))"
}
1062
# Check that a PMTU exception is created on a vti6 tunnel when the link layer
# MTU is exceeded, and that its value follows changes of the tunnel MTU.
#
# Returns 2 if the setup fails, 1 if no exception is created or if its value
# doesn't follow the tunnel MTU, 0 otherwise
test_pmtu_vti6_exception() {
	setup namespaces veth vti6 xfrm6 || return 2
	trace "${ns_a}" veth_a "${ns_b}" veth_b \
	      "${ns_a}" vti6_a "${ns_b}" vti6_b
	fail=0

	# Create route exception by exceeding link layer MTU
	mtu "${ns_a}" veth_a 4000
	mtu "${ns_b}" veth_b 4000
	mtu "${ns_a}" vti6_a 5000
	mtu "${ns_b}" vti6_b 5000
	run_cmd ${ns_a} ${ping6} -q -i 0.1 -w 1 -s 60000 ${tunnel6_b_addr}

	# Check that exception was created: without it, the checks below make
	# no sense, so give up right away
	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel6_b_addr})"
	if ! check_pmtu_value any "${pmtu}" "creating tunnel exceeding link layer MTU"; then
		return 1
	fi

	# Decrease tunnel MTU, check for PMTU decrease in route exception
	mtu "${ns_a}" vti6_a 3000
	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel6_b_addr})"
	if ! check_pmtu_value "3000" "${pmtu}" "decreasing tunnel MTU"; then
		fail=1
	fi

	# Increase tunnel MTU, check for PMTU increase in route exception
	mtu "${ns_a}" vti6_a 9000
	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel6_b_addr})"
	if ! check_pmtu_value "9000" "${pmtu}" "increasing tunnel MTU"; then
		fail=1
	fi

	return ${fail}
}
1092
# Check that the default MTU of a vti4 device is the MTU of the veth device
# minus the IPv4 header length (20 bytes).
#
# Returns 2 if the setup fails, 1 if the MTUs don't differ by 20, 0 otherwise
test_pmtu_vti4_default_mtu() {
	setup namespaces veth vti4 || return 2

	veth_mtu="$(link_get_mtu "${ns_a}" veth_a)"
	vti4_mtu="$(link_get_mtu "${ns_a}" vti4_a)"

	# Nothing to report if MTU of vti device is MTU of veth minus IPv4
	# header length
	[ $((veth_mtu - vti4_mtu)) -eq 20 ] && return 0

	err "  vti MTU ${vti4_mtu} is not veth MTU ${veth_mtu} minus IPv4 header length"
	return 1
}
1104
# Check that the default MTU of a vti6 device is the MTU of the veth device
# minus the IPv6 header length (40 bytes).
#
# Returns 2 if the setup fails, 1 if the MTUs don't differ by 40, 0 otherwise
test_pmtu_vti6_default_mtu() {
	setup namespaces veth vti6 || return 2

	veth_mtu="$(link_get_mtu "${ns_a}" veth_a)"
	vti6_mtu="$(link_get_mtu "${ns_a}" vti6_a)"

	# Nothing to report if MTU of vti device is MTU of veth minus IPv6
	# header length
	[ $((veth_mtu - vti6_mtu)) -eq 40 ] && return 0

	err "  vti MTU ${vti6_mtu} is not veth MTU ${veth_mtu} minus IPv6 header length"
	return 1
}
1116
# Check how MTU values passed on vti link creation are handled: values just
# outside the 68..65515 range must be rejected or adjusted to a valid MTU,
# values inside the range must be applied as they are.
#
# Returns 2 if the setup fails or a plain vti device can't be created, 1 if
# any check fails, 0 otherwise
test_pmtu_vti4_link_add_mtu() {
	setup namespaces || return 2

	# Probe for vti support with a throwaway device
	if ! run_cmd ${ns_a} ip link add vti4_a type vti local ${veth4_a_addr} remote ${veth4_b_addr} key 10; then
		err "  vti not supported" && return 2
	fi
	run_cmd ${ns_a} ip link del vti4_a

	fail=0

	min=68
	max=$((65535 - 20))
	# Check invalid values first
	for v in $((min - 1)) $((max + 1)); do
		# This can fail, or MTU can be adjusted to a proper value
		run_cmd ${ns_a} ip link add vti4_a mtu ${v} type vti local ${veth4_a_addr} remote ${veth4_b_addr} key 10 || continue
		mtu="$(link_get_mtu "${ns_a}" vti4_a)"
		if [ ${mtu} -lt ${min} -o ${mtu} -gt ${max} ]; then
			err "  vti tunnel created with invalid MTU ${mtu}"
			fail=1
		fi
		run_cmd ${ns_a} ip link del vti4_a
	done

	# Now check valid values
	for v in ${min} 1300 ${max}; do
		run_cmd ${ns_a} ip link add vti4_a mtu ${v} type vti local ${veth4_a_addr} remote ${veth4_b_addr} key 10
		mtu="$(link_get_mtu "${ns_a}" vti4_a)"
		run_cmd ${ns_a} ip link del vti4_a
		[ "${mtu}" = "${v}" ] && continue
		err "  vti MTU ${mtu} doesn't match configured value ${v}"
		fail=1
	done

	return ${fail}
}
1154
# Check how MTU values passed on vti6 link creation are handled: values just
# outside the valid range must be rejected or adjusted to a valid MTU, values
# inside the range must be applied as they are.
#
# Returns 2 if the setup fails or a plain vti6 device can't be created, 1 if
# any check fails, 0 otherwise
test_pmtu_vti6_link_add_mtu() {
	setup namespaces || return 2

	# Probe for vti6 support with a throwaway device
	run_cmd ${ns_a} ip link add vti6_a type vti6 local ${veth6_a_addr} remote ${veth6_b_addr} key 10
	[ $? -ne 0 ] && err "  vti6 not supported" && return 2
	run_cmd ${ns_a} ip link del vti6_a

	fail=0

	min=68			# vti6 can carry IPv4 packets too
	max=$((65535 - 40))
	# Check invalid values first
	for v in $((min - 1)) $((max + 1)); do
		run_cmd ${ns_a} ip link add vti6_a mtu ${v} type vti6 local ${veth6_a_addr} remote ${veth6_b_addr} key 10
		# This can fail, or MTU can be adjusted to a proper value
		[ $? -ne 0 ] && continue
		mtu="$(link_get_mtu "${ns_a}" vti6_a)"
		if [ ${mtu} -lt ${min} -o ${mtu} -gt ${max} ]; then
			# Report the MTU the device actually got, which is
			# what was checked, not the value that was requested
			err "  vti6 tunnel created with invalid MTU ${mtu}"
			fail=1
		fi
		run_cmd ${ns_a} ip link del vti6_a
	done

	# Now check valid values: both ends of the range, plus 1280 and 1300
	for v in ${min} 1280 1300 ${max}; do
		run_cmd ${ns_a} ip link add vti6_a mtu ${v} type vti6 local ${veth6_a_addr} remote ${veth6_b_addr} key 10
		mtu="$(link_get_mtu "${ns_a}" vti6_a)"
		run_cmd ${ns_a} ip link del vti6_a
		if [ "${mtu}" != "${v}" ]; then
			err "  vti6 MTU ${mtu} doesn't match configured value ${v}"
			fail=1
		fi
	done

	return ${fail}
}
1192
# Check the MTU of a vti6 device as its endpoints are moved between two dummy
# devices with different MTUs (1500 and 3000): an MTU passed explicitly must
# be kept, otherwise the MTU must be adjusted to the one of the new underlying
# device minus the IPv6 header length.
#
# Returns 2 if the setup fails or the first dummy device can't be created, 1
# if any MTU check fails, 0 otherwise
test_pmtu_vti6_link_change_mtu() {
	setup namespaces || return 2

	run_cmd ${ns_a} ip link add dummy0 mtu 1500 type dummy
	[ $? -ne 0 ] && err "  dummy not supported" && return 2
	run_cmd ${ns_a} ip link add dummy1 mtu 3000 type dummy
	run_cmd ${ns_a} ip link set dummy0 up
	run_cmd ${ns_a} ip link set dummy1 up

	run_cmd ${ns_a} ip addr add ${dummy6_0_prefix}1/${dummy6_mask} dev dummy0
	run_cmd ${ns_a} ip addr add ${dummy6_1_prefix}1/${dummy6_mask} dev dummy1

	fail=0

	# MTU values are compared as quoted strings: should the device be
	# missing, the empty value is then reported as a mismatch, instead of
	# making the comparison itself fail and the check silently pass

	# Create vti6 interface bound to device, passing MTU, check it
	run_cmd ${ns_a} ip link add vti6_a mtu 1300 type vti6 remote ${dummy6_0_prefix}2 local ${dummy6_0_prefix}1
	mtu="$(link_get_mtu "${ns_a}" vti6_a)"
	if [ "${mtu}" != "1300" ]; then
		err "  vti6 MTU ${mtu} doesn't match configured value 1300"
		fail=1
	fi

	# Move to another device with different MTU, without passing MTU, check
	# MTU is adjusted
	run_cmd ${ns_a} ip link set vti6_a type vti6 remote ${dummy6_1_prefix}2 local ${dummy6_1_prefix}1
	mtu="$(link_get_mtu "${ns_a}" vti6_a)"
	if [ "${mtu}" != "$((3000 - 40))" ]; then
		err "  vti MTU ${mtu} is not dummy MTU 3000 minus IPv6 header length"
		fail=1
	fi

	# Move it back, passing MTU, check MTU is not overridden
	run_cmd ${ns_a} ip link set vti6_a mtu 1280 type vti6 remote ${dummy6_0_prefix}2 local ${dummy6_0_prefix}1
	mtu="$(link_get_mtu "${ns_a}" vti6_a)"
	if [ "${mtu}" != "1280" ]; then
		err "  vti6 MTU ${mtu} doesn't match configured value 1280"
		fail=1
	fi

	return ${fail}
}
1234
# Check that a command needed by a test is available.
#
# $1:	name of the command to look for
#
# Returns 0 if the command is found, 1, after reporting it with err(), if not
check_command() {
	cmd=${1}

	# 'command -v' is specified by POSIX and built into the shell, so the
	# lookup doesn't depend on which(1) being installed
	if ! command -v "${cmd}" > /dev/null 2>&1; then
		err "  missing required command: '${cmd}'"
		return 1
	fi
	return 0
}
1244
# Check that a veth device can be deleted in a timely manner after PMTU
# exceptions were created, over a VXLAN tunnel, from up to two CPUs.
#
# $1:	IP version of the VXLAN transport (outer), 4 or 6
#
# Returns 2 if taskset is not available or the setup fails, 1 if the process
# deleting the device is still found in /proc one second after it was started
test_cleanup_vxlanX_exception() {
	outer="${1}"
	encap="vxlan"
	ll_mtu=4000

	check_command taskset || return 2
	# Second space-separated field of the first two "processor" lines
	cpu_list=$(grep -m 2 processor /proc/cpuinfo | cut -d ' ' -f 2)

	setup namespaces routing ${encap}${outer} || return 2
	trace "${ns_a}" ${encap}_a "${ns_b}"  ${encap}_b \
	      "${ns_a}" veth_A-R1  "${ns_r1}" veth_R1-A  \
	      "${ns_b}" veth_B-R1  "${ns_r1}" veth_R1-B

	# Create route exception by exceeding link layer MTU
	mtu "${ns_a}"  veth_A-R1 $((ll_mtu + 1000))
	mtu "${ns_r1}" veth_R1-A $((ll_mtu + 1000))
	mtu "${ns_b}"  veth_B-R1 ${ll_mtu}
	mtu "${ns_r1}" veth_R1-B ${ll_mtu}

	mtu "${ns_a}" ${encap}_a $((ll_mtu + 1000))
	mtu "${ns_b}" ${encap}_b $((ll_mtu + 1000))

	# Fill exception cache for multiple CPUs (2)
	# we can always use inner IPv4 for that
	for cpu in ${cpu_list}; do
		run_cmd taskset --cpu-list ${cpu} ${ns_a} ping -q -M want -i 0.1 -w 1 -s $((ll_mtu + 500)) ${tunnel4_b_addr}
	done

	# Delete the device in the background, and give it one second to finish
	${ns_a} ip link del dev veth_A-R1 &
	iplink_pid=$!
	sleep 1
	# The command line in /proc is NUL-separated, hence the stripped form
	if [ "$(cat /proc/${iplink_pid}/cmdline 2>/dev/null | tr -d '\0')" = "iplinkdeldevveth_A-R1" ]; then
		err "  can't delete veth device in a timely manner, PMTU dst likely leaked"
		return 1
	fi
}
1281
# Exception cleanup test, with VXLAN over IPv6 transport (the argument is the
# outer IP version)
test_cleanup_ipv6_exception() {
	test_cleanup_vxlanX_exception 6
}
1285
# Exception cleanup test, with VXLAN over IPv4 transport (the argument is the
# outer IP version)
test_cleanup_ipv4_exception() {
	test_cleanup_vxlanX_exception 4
}
1289
# Run a single test in a subshell and print its result.
#
# $1:	test name, the function called is test_$1
# $2:	description shown in the result line
#
# The result line reads "[ OK ]" for status 0, "[FAIL]" for status 1 and
# "[SKIP]" for status 2; err_flush is called in the last two cases. With
# PAUSE_ON_FAIL set to "yes", a failure waits for a line of input first.
# Any non-zero status sets the global exitcode to 1.
#
# Returns the status of the test
run_test() {
	(
	tname="$1"
	tdesc="$2"

	unset IFS

	[ "$VERBOSE" = "1" ] && printf "\n##########################################################################\n\n"

	eval test_${tname}
	ret=$?

	case ${ret} in
	0)
		printf "TEST: %-60s  [ OK ]\n" "${tdesc}"
		;;
	1)
		printf "TEST: %-60s  [FAIL]\n" "${tdesc}"
		if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
			echo
			echo "Pausing. Hit enter to continue"
			read a
		fi
		err_flush
		exit 1
		;;
	2)
		printf "TEST: %-60s  [SKIP]\n" "${tdesc}"
		err_flush
		;;
	esac

	return $ret
	)
	ret=$?
	if [ $ret -ne 0 ]; then
		exitcode=1
	fi

	return $ret
}
1327
# Run a test with USE_NH set to "yes", appending " - nexthop objects" to its
# description. USE_NH is set back to "no" once the test is done.
#
# $1:	test name, passed on to run_test()
# $2:	test description
run_test_nh() {
	USE_NH=yes
	tname="$1"
	tdesc="$2"

	run_test "${tname}" "${tdesc} - nexthop objects"
	USE_NH=no
}
1336
# Check that cached IPv4 route exceptions can be listed and flushed.
#
# 100 exceptions are created over the path via R1, one over the path via R2.
# 'ip route list cache' is then expected to show 101 entries, and, after
# 'ip route flush cache', no exception at all.
#
# Returns 2 if the setup fails, 1 if listing or flushing fails, 0 otherwise
test_list_flush_ipv4_exception() {
	setup namespaces routing || return 2
	trace "${ns_a}"  veth_A-R1    "${ns_r1}" veth_R1-A \
	      "${ns_r1}" veth_R1-B    "${ns_b}"  veth_B-R1 \
	      "${ns_a}"  veth_A-R2    "${ns_r2}" veth_R2-A \
	      "${ns_r2}" veth_R2-B    "${ns_b}"  veth_B-R2

	dst_prefix1="${prefix4}.${b_r1}."
	dst2="${prefix4}.${b_r2}.1"

	# Set up initial MTU values
	mtu "${ns_a}"  veth_A-R1 2000
	mtu "${ns_r1}" veth_R1-A 2000
	mtu "${ns_r1}" veth_R1-B 1500
	mtu "${ns_b}"  veth_B-R1 1500

	mtu "${ns_a}"  veth_A-R2 2000
	mtu "${ns_r2}" veth_R2-A 2000
	mtu "${ns_r2}" veth_R2-B 1500
	mtu "${ns_b}"  veth_B-R2 1500

	fail=0

	# Add 100 addresses for veth endpoint on B reached by default A route
	for i in $(seq 100 199); do
		run_cmd ${ns_b} ip addr add "${dst_prefix1}${i}" dev veth_B-R1
	done

	# Create 100 cached route exceptions for path via R1, one via R2. Note
	# that with IPv4 we need to actually cause a route lookup that matches
	# the exception caused by ICMP, in order to actually have a cached
	# route, so we need to ping each destination twice
	for i in $(seq 100 199); do
		run_cmd ${ns_a} ping -q -M want -i 0.1 -c 2 -s 1800 "${dst_prefix1}${i}"
	done
	run_cmd ${ns_a} ping -q -M want -i 0.1 -c 2 -s 1800 "${dst2}"

	if [ "$(${ns_a} ip -oneline route list cache | wc -l)" -ne 101 ]; then
		err "  can't list cached exceptions"
		fail=1
	fi

	run_cmd ${ns_a} ip route flush cache
	# Look up destinations that did have an exception before the flush:
	# the first address reached via R1, and the one reached via R2
	pmtu1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst_prefix1}100")"
	pmtu2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}")"
	if [ -n "${pmtu1}" ] || [ -n "${pmtu2}" ] || \
	   [ -n "$(${ns_a} ip route list cache)" ]; then
		err "  can't flush cached exceptions"
		fail=1
	fi

	return ${fail}
}
1390
# Check that cached IPv6 route exceptions can be listed and flushed.
#
# 100 exceptions are created over the path via R1, one over the path via R2.
# 'ip -6 route list cache' is then expected to show 101 entries, and, after
# 'ip -6 route flush cache', no exception at all.
#
# Returns 2 if the setup fails, 1 if listing or flushing fails, 0 otherwise
test_list_flush_ipv6_exception() {
	setup namespaces routing || return 2
	trace "${ns_a}"  veth_A-R1    "${ns_r1}" veth_R1-A \
	      "${ns_r1}" veth_R1-B    "${ns_b}"  veth_B-R1 \
	      "${ns_a}"  veth_A-R2    "${ns_r2}" veth_R2-A \
	      "${ns_r2}" veth_R2-B    "${ns_b}"  veth_B-R2

	dst_prefix1="${prefix6}:${b_r1}::"
	dst2="${prefix6}:${b_r2}::1"

	# Set up initial MTU values
	mtu "${ns_a}"  veth_A-R1 2000
	mtu "${ns_r1}" veth_R1-A 2000
	mtu "${ns_r1}" veth_R1-B 1500
	mtu "${ns_b}"  veth_B-R1 1500

	mtu "${ns_a}"  veth_A-R2 2000
	mtu "${ns_r2}" veth_R2-A 2000
	mtu "${ns_r2}" veth_R2-B 1500
	mtu "${ns_b}"  veth_B-R2 1500

	fail=0

	# Add 100 addresses for veth endpoint on B reached by default A route
	for i in $(seq 100 199); do
		run_cmd ${ns_b} ip addr add "${dst_prefix1}${i}" dev veth_B-R1
	done

	# Create 100 cached route exceptions for path via R1, one via R2. As in
	# the other IPv6 tests, use ${ping6}: a plain ping might not be able to
	# deal with IPv6 destinations
	for i in $(seq 100 199); do
		run_cmd ${ns_a} ${ping6} -q -M want -i 0.1 -w 1 -s 1800 "${dst_prefix1}${i}"
	done
	run_cmd ${ns_a} ${ping6} -q -M want -i 0.1 -w 1 -s 1800 "${dst2}"
	if [ "$(${ns_a} ip -oneline -6 route list cache | wc -l)" -ne 101 ]; then
		err "  can't list cached exceptions"
		fail=1
	fi

	run_cmd ${ns_a} ip -6 route flush cache
	# Look up the first destination reached via R1 and the one via R2
	pmtu1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst_prefix1}100")"
	pmtu2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2})"
	if [ -n "${pmtu1}" ] || [ -n "${pmtu2}" ] || \
	   [ -n "$(${ns_a} ip -6 route list cache)" ]; then
		err "  can't flush cached exceptions"
		fail=1
	fi

	return ${fail}
}
1440
# Print a usage message, including the content of ${tests}, and exit with
# status 1. The options listed are the ones the script parses with getopts
usage() {
	echo
	echo "$0 [OPTIONS] [TEST]..."
	echo "If no TEST argument is given, all tests will be run."
	echo
	echo "Options"
	echo "  -p: pause on test failure, until enter is hit"
	echo "  -t: capture traffic to TEST_INTERFACE.pcap (needs tcpdump)"
	echo "  -v: verbose mode"
	echo
	echo "Available tests${tests}"
	exit 1
}
1452
################################################################################
# Initial state, then command line options: -p (pause on failure), -t (trace
# with tcpdump, if available), -v (verbose). Anything else prints the usage
exitcode=0
desc=0

while getopts :ptv o; do
	case "${o}" in
	p)
		PAUSE_ON_FAIL=yes
		;;
	t)
		# Without tcpdump, go on with tracing disabled
		if which tcpdump > /dev/null 2>&1; then
			TRACING=1
		else
			echo "=== tcpdump not available, tracing disabled"
		fi
		;;
	v)
		VERBOSE=1
		;;
	*)
		usage
		;;
	esac
done
# Leave test names only in the positional parameters
shift $((OPTIND - 1))
1473
1474IFS="
1475"
1476
# Check first that all requested tests are available before running any
for arg do
	if ! command -v > /dev/null "test_${arg}"; then
		echo "=== Test ${arg} not found"
		usage
	fi
done

# Run cleanup on exit, and right away as well, to start clean
trap cleanup EXIT
cleanup

# Nexthop objects can be used only if 'ip nexthop ls' works
if ip nexthop ls >/dev/null 2>&1; then
	HAVE_NH=yes
else
	HAVE_NH=no
fi
1490
# Walk the list of tests. Fields of ${tests} come in groups of three: test
# name, description, and the value used for rerun_nh ("1" requests a second
# run with nexthop objects, if they are available)
name=""
desc=""
rerun_nh=0
for t in ${tests}; do
	if [ "${name}" = "" ]; then
		name="${t}"
		continue
	fi
	if [ "${desc}" = "" ]; then
		desc="${t}"
		continue
	fi

	[ "${HAVE_NH}" = "yes" ] && rerun_nh="${t}"

	# Without test names on the command line, everything is run, otherwise
	# only the tests that were named
	run_this=1
	for arg do
		case "${arg}" in
		--*) continue ;;
		esac
		if [ "${arg}" = "${name}" ]; then
			run_this=1
			break
		fi
		run_this=0
	done
	if [ ${run_this} -eq 1 ]; then
		run_test "${name}" "${desc}"
		# if test was skipped no need to retry with nexthop objects
		if [ $? -eq 2 ]; then
			rerun_nh=0
		fi

		[ "${rerun_nh}" = "1" ] && run_test_nh "${name}" "${desc}"
	fi

	# Start over with the next group of fields
	name=""
	desc=""
	rerun_nh=0
done

exit ${exitcode}
1523