#!/bin/bash
# SPDX-License-Identifier: GPL-2.0+
#
# Run a series of tests on remote systems under KVM.
#
# Usage: kvm-remote.sh "systems" [ <kvm.sh args> ]
#	 kvm-remote.sh "systems" /path/to/old/run [ <kvm-again.sh args> ]
#
# Copyright (C) 2021 Facebook, Inc.
#
# Authors: Paul E. McKenney <paulmck@kernel.org>
#
# Source: /linux/tools/testing/selftests/rcutorture/bin/kvm-remote.sh
#	  (revision d97e2634fbdcd238a51bc363267df0139c17f4da)

# Remember how we were invoked so that the command line can be logged.
scriptname=$0
args="$*"

# The relative pathnames used below assume that the current directory is
# the top of the kernel source tree.
if ! test -d tools/testing/selftests/rcutorture/bin
then
	echo $scriptname must be run from top-level directory of kernel source tree.
	exit 1
fi

# Put the rcutorture bin directory on PATH so that functions.sh and the
# other scripts in that directory can be invoked by name.
RCUTORTURE="`pwd`/tools/testing/selftests/rcutorture"; export RCUTORTURE
PATH=${RCUTORTURE}/bin:$PATH; export PATH
. functions.sh

# get_starttime is presumably supplied by functions.sh; its value is
# handed to kvm-end-run-stats.sh at the end of the run.
starttime="`get_starttime`"

# The first argument is the whitespace-separated list of remote systems;
# the remaining arguments are left in "$@" for kvm.sh or kvm-again.sh.
systems="$1"
if test -z "$systems"
then
	echo $scriptname: Empty list of systems will go nowhere good, giving up.
	exit 1
fi
shift

# Pathnames:
# T:	  /tmp/kvm-remote.sh.NNNNNN where "NNNNNN" is set by mktemp
#	  ($TMPDIR replaces /tmp if it is set)
# resdir: /tmp/kvm-remote.sh.NNNNNN/res
# rundir: /tmp/kvm-remote.sh.NNNNNN/res/$ds ("-remote" suffix)
# oldrun: `pwd`/tools/testing/.../res/$otherds
#
# Pathname segments:
# TD:	  kvm-remote.sh.NNNNNN
# ds:	  yyyy.mm.dd-hh.mm.ss-remote

T="`mktemp -d ${TMPDIR-/tmp}/kvm-remote.sh.XXXXXX`"
# Without a scratch directory, every later "$T/..." pathname would land
# in the root directory, so give up right away if mktemp failed.
if test -z "$T" || ! test -d "$T"
then
	echo $scriptname: Unable to create temporary directory, giving up.
	exit 1
fi
# Remove the scratch directory on any exit from this script.
trap 'rm -rf "$T"' 0
TD="`basename "$T"`"

resdir="$T/res"
ds=`date +%Y.%m.%d-%H.%M.%S`-remote
rundir=$resdir/$ds
echo Results directory: $rundir
echo $scriptname $args
# Either do a fresh build (first remaining argument starts with "--", and
# all arguments are passed to kvm.sh) or re-use an old run (first remaining
# argument is the pathname of that run, the rest go to kvm-again.sh).
# Both paths invoke kvm-again.sh with --dryrun --remote --rundir "$rundir"
# and leave $oldrun naming the local directory whose remote-log file
# receives this script's log output.
if echo $1 | grep -q '^--'
then
	# Fresh build.  Create a datestamp unless the caller supplied one.
	datestamp="`echo "$@" | awk -v ds="$ds" '{
		for (i = 1; i < NF; i++) {
			if ($i == "--datestamp") {
				ds = "";
				break;
			}
		}
		if (ds != "")
			print "--datestamp " ds;
	}'`"
	kvm.sh --remote "$@" $datestamp --buildonly > $T/kvm.sh.out 2>&1
	ret=$?
	if test "$ret" -ne 0
	then
		# Report the saved status: by now $? is that of "test" above.
		echo $scriptname: kvm.sh failed exit code $ret
		cat $T/kvm.sh.out
		exit 2
	fi
	# kvm.sh reports where it put its results; that becomes $oldrun.
	oldrun="`grep -m 1 "^Results directory: " $T/kvm.sh.out | awk '{ print $3 }'`"
	touch "$oldrun/remote-log"
	echo $scriptname $args >> "$oldrun/remote-log"
	echo | tee -a "$oldrun/remote-log"
	echo " ----" kvm.sh output: "(`date`)" | tee -a "$oldrun/remote-log"
	cat $T/kvm.sh.out | tee -a "$oldrun/remote-log"
	# We are going to run this, so remove the buildonly files.
	rm -f "$oldrun"/*/buildonly
	kvm-again.sh $oldrun --dryrun --remote --rundir "$rundir" > $T/kvm-again.sh.out 2>&1
	ret=$?
	if test "$ret" -ne 0
	then
		echo $scriptname: kvm-again.sh failed exit code $ret | tee -a "$oldrun/remote-log"
		cat $T/kvm-again.sh.out | tee -a "$oldrun/remote-log"
		exit 2
	fi
else
	# Re-use old run.
	oldrun="$1"
	# Convert a relative pathname to an absolute one.
	if ! echo $oldrun | grep -q '^/'
	then
		oldrun="`pwd`/$oldrun"
	fi
	shift
	touch "$oldrun/remote-log"
	echo $scriptname $args >> "$oldrun/remote-log"
	kvm-again.sh "$oldrun" "$@" --dryrun --remote --rundir "$rundir" > $T/kvm-again.sh.out 2>&1
	ret=$?
	if test "$ret" -ne 0
	then
		echo $scriptname: kvm-again.sh failed exit code $ret | tee -a "$oldrun/remote-log"
		cat $T/kvm-again.sh.out | tee -a "$oldrun/remote-log"
		exit 2
	fi
	# Keep a local copy of the new run directory and log there from
	# now on, because $T is removed when this script exits.
	cp -a "$rundir" "$RCUTORTURE/res/"
	oldrun="$RCUTORTURE/res/$ds"
fi
# Show the kvm-again.sh output and the directories in use, both on the
# terminal and in the remote-log file.  The kvm-again.sh output itself
# is logged as well, so that the header line in the log has a body.
echo | tee -a "$oldrun/remote-log"
echo " ----" kvm-again.sh output: "(`date`)" | tee -a "$oldrun/remote-log"
cat $T/kvm-again.sh.out | tee -a "$oldrun/remote-log"
echo | tee -a "$oldrun/remote-log"
echo Remote run directory: $rundir | tee -a "$oldrun/remote-log"
echo Local build-side run directory: $oldrun | tee -a "$oldrun/remote-log"

# Create the kvm-remote-N.sh scripts in the bin directory.
#
# Each line of the scenarios file yields one script.  The first field,
# with its first period removed, supplies the "N" in the script name, and
# the remaining fields become the arguments to kvm-test-1-run-batch.sh.
# Each script's final step removes $rundir/remote.run, the file that
# checkremotefile looks for.
awk < "$rundir"/scenarios -v dest="$T/bin" -v rundir="$rundir" '
{
	n = $1;
	sub(/\./, "", n);
	fn = dest "/kvm-remote-" n ".sh"
	print "kvm-remote-noreap.sh " rundir " &" > fn;
	scenarios = "";
	for (i = 2; i <= NF; i++)
		scenarios = scenarios " " $i;
	print "kvm-test-1-run-batch.sh" scenarios >> fn;
	print "sync" >> fn;
	print "rm " rundir "/remote.run" >> fn;
}'
chmod +x $T/bin/kvm-remote-*.sh
# Archive bin and res under their $TD-relative names, following symbolic
# links (-h).  Do not run tar from the wrong directory if the cd fails.
( cd "`dirname $T`" && tar -chzf $T/binres.tgz "$TD/bin" "$TD/res" )

# Check first to avoid the need for cleanup for system-name typos
#
# Each system must answer a non-interactive ssh and report its online
# CPU count; the first one that does not ends the run with exit code 4.
for i in $systems
do
	ssh -o BatchMode=yes $i getconf _NPROCESSORS_ONLN > $T/ssh.stdout 2> $T/ssh.stderr
	ret=$?
	if test "$ret" -ne 0
	then
		echo "System $i unreachable ($ret), giving up." | tee -a "$oldrun/remote-log"
		echo ' --- ssh stdout: vvv' | tee -a "$oldrun/remote-log"
		cat $T/ssh.stdout | tee -a "$oldrun/remote-log"
		echo ' --- ssh stdout: ^^^' | tee -a "$oldrun/remote-log"
		echo ' --- ssh stderr: vvv' | tee -a "$oldrun/remote-log"
		cat $T/ssh.stderr | tee -a "$oldrun/remote-log"
		echo ' --- ssh stderr: ^^^' | tee -a "$oldrun/remote-log"
		exit 4
	fi
	echo $i: `cat $T/ssh.stdout` CPUs " " `date` | tee -a "$oldrun/remote-log"
done

# Function to download and expand the tarball on the specified system.
# On failure, waits 60 seconds and retries.  Exits the script with code
# 10 once a retry fails with more than five retries already behind it.
#
# NOTE(review): the tarball is always expanded under /tmp on the remote
# system, but later remote commands use pathnames under $T.  These look
# like they agree only when TMPDIR is unset or /tmp locally -- confirm.
#
# Usage: downloadtarball system
downloadtarball () {
	local ret
	local tries=0

	cat $T/binres.tgz | ssh -o BatchMode=yes $1 "cd /tmp; tar -xzf -"
	ret=$?
	while test "$ret" -ne 0
	do
		echo Unable to download $T/binres.tgz to system $1, waiting and then retrying.  $tries prior retries. | tee -a "$oldrun/remote-log"
		sleep 60
		cat $T/binres.tgz | ssh -o BatchMode=yes $1 "cd /tmp; tar -xzf -"
		ret=$?
		if test "$ret" -ne 0
		then
			# Numeric comparison: a bare ">" here would instead
			# redirect output to a file named "5".
			if test "$tries" -gt 5
			then
				echo Unable to download $T/binres.tgz to system $1, giving up. | tee -a "$oldrun/remote-log"
				exit 10
			fi
		fi
		tries=$((tries+1))
	done
}

# Download and expand the tarball on all systems.
echo Build-products tarball: `du -h $T/binres.tgz` | tee -a "$oldrun/remote-log"
for i in $systems
do
	echo Downloading tarball to $i `date` | tee -a "$oldrun/remote-log"
	downloadtarball $i
done

# Function to check for presence of a file on the specified system.
# Complain if the system cannot be reached, and retry after a wait.
# Gives up after 16 consecutive ssh failures with a 60-second wait
# between attempts, so a machine that disappears is waited on for
# about 15 minutes.
#
# Usage: checkremotefile system pathname
#
# Returns 0 if the file is present, 1 if it is not, 255 on persistent
# ssh failure, and otherwise the unexpected exit code from ssh.  Messages
# go to stdout and are appended to $oldrun/remote-log.
checkremotefile () {
	local nsshfails=0
	local ret
	local sleeptime=60

	while :
	do
		ssh -o BatchMode=yes "$1" "test -f \"$2\""
		ret=$?
		if test "$ret" -eq 255
		then
			# Exit code 255 is treated as failure of ssh itself
			# rather than of the remote "test -f".
			echo " ---" ssh failure to $1 checking for file $2, retry after $sleeptime seconds. `date` | tee -a "$oldrun/remote-log"
			nsshfails=$((nsshfails+1))
			if ((nsshfails > 15))
			then
				return 255
			fi
		elif test "$ret" -eq 0
		then
			return 0
		elif test "$ret" -eq 1
		then
			echo " ---" File \"$2\" not found: ssh $1 test -f \"$2\" | tee -a "$oldrun/remote-log"
			return 1
		else
			# NOTE(review): this message promises a retry, but the
			# function returns at once and leaves any retrying to
			# the caller -- confirm which of the two is intended.
			echo " ---" Exit code $ret: ssh $1 test -f \"$2\", retry after $sleeptime seconds. `date` | tee -a "$oldrun/remote-log"
			return $ret
		fi
		sleep $sleeptime
	done
}

# Function to start batches on idle remote $systems
#
# Usage: startbatches curbatch nbatches
#
# Batches are numbered starting at 1.  Returns the next batch to start.
# Be careful to redirect all debug output to FD 2 (stderr).
#
# The "return value" is written to stdout, which is why stdout must carry
# nothing else.  A system counts as idle when checkremotefile does not
# find $resdir/$ds/remote.run on it.  Exits the script with code 11 if
# the ssh command that launches a batch fails.
startbatches () {
	local curbatch="$1"
	local nbatches="$2"
	local i	# Keep the loop variable out of the caller's scope.
	local ret

	# Each pass through the following loop examines one system.
	for i in $systems
	do
		if test "$curbatch" -gt "$nbatches"
		then
			echo $((nbatches + 1))
			return 0
		fi
		if checkremotefile "$i" "$resdir/$ds/remote.run" 1>&2
		then
			continue # System still running last test, skip.
		fi
		# Mark the system busy, then launch the batch script in the
		# background on the remote system so that ssh returns at once.
		# Note that $T and $PATH are expanded locally.
		ssh -o BatchMode=yes "$i" "cd \"$resdir/$ds\"; touch remote.run; PATH=\"$T/bin:$PATH\" nohup kvm-remote-$curbatch.sh > kvm-remote-$curbatch.sh.out 2>&1 &" 1>&2
		ret=$?
		if test "$ret" -ne 0
		then
			echo ssh $i failed: exitcode $ret 1>&2
			exit 11
		fi
		echo " ----" System $i Batch `head -n $curbatch < "$rundir"/scenarios | tail -1` `date` 1>&2
		curbatch=$((curbatch + 1))
	done
	echo $curbatch
}

# Launch all the scenarios.
#
# There is one batch per line of the scenarios file.  Each pass hands as
# many batches as possible to idle systems, then waits 30 seconds before
# trying again if batches remain.
nbatches="`wc -l "$rundir"/scenarios | awk '{ print $1 }'`"
curbatch=1
while test "$curbatch" -le "$nbatches"
do
	# Run startbatches in a subshell so that its "exit" on ssh failure
	# cannot end this script before the captured stderr is logged.
	( startbatches $curbatch $nbatches ) > $T/curbatch 2> $T/startbatches.stderr
	ret=$?
	curbatch="`cat $T/curbatch`"
	if test -s "$T/startbatches.stderr"
	then
		cat "$T/startbatches.stderr" | tee -a "$oldrun/remote-log"
	fi
	if test "$ret" -ne 0
	then
		exit $ret
	fi
	if test "$curbatch" -le "$nbatches"
	then
		sleep 30
	fi
done
echo All batches started. `date` | tee -a "$oldrun/remote-log"

# Wait for all remaining scenarios to complete and collect results.
#
# A system is finished once its remote.run file is gone (checkremotefile
# returns 1).  Its output files are then streamed back as a tarball and
# unpacked into $oldrun, after which the remote copy of $T is removed.
# Persistent ssh failure (255) abandons that system's results; any other
# return value means "check again in 30 seconds".
for i in $systems
do
	echo " ---" Waiting for $i `date` | tee -a "$oldrun/remote-log"
	while :
	do
		checkremotefile "$i" "$resdir/$ds/remote.run"
		ret=$?
		if test "$ret" -eq 1
		then
			echo " ---" Collecting results from $i `date` | tee -a "$oldrun/remote-log"
			( cd "$oldrun"; ssh -o BatchMode=yes $i "cd $rundir; tar -czf - kvm-remote-*.sh.out */console.log */kvm-test-1-run*.sh.out */qemu[_-]pid */qemu-retval */qemu-affinity; rm -rf $T > /dev/null 2>&1" | tar -xzf - )
			break;
		fi
		if test "$ret" -eq 255
		then
			echo System $i persistent ssh failure, lost results `date` | tee -a "$oldrun/remote-log"
			break;
		fi
		sleep 30
	done
done

# Summarize the run, logging the summary, and exit with the status of
# kvm-end-run-stats.sh.  That status is passed through a file because
# the pipeline's own status is that of tee.
( kvm-end-run-stats.sh "$oldrun" "$starttime"; echo $? > $T/exitcode ) | tee -a "$oldrun/remote-log"
exit "`cat $T/exitcode`"