xref: /illumos-gate/usr/src/cmd/intrd/intrd.pl (revision 814a60b13c0ad90e5d2edfd29a7a84bbf416cc1a)
1#!/usr/perl5/bin/perl
2#
3# CDDL HEADER START
4#
5# The contents of this file are subject to the terms of the
6# Common Development and Distribution License, Version 1.0 only
7# (the "License").  You may not use this file except in compliance
8# with the License.
9#
10# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
11# or http://www.opensolaris.org/os/licensing.
12# See the License for the specific language governing permissions
13# and limitations under the License.
14#
15# When distributing Covered Code, include this CDDL HEADER in each
16# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
17# If applicable, add the following below this CDDL HEADER, with the
18# fields enclosed by brackets "[]" replaced with your own identifying
19# information: Portions Copyright [yyyy] [name of copyright owner]
20#
21# CDDL HEADER END
22#
23
24#
25# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
26# Use is subject to license terms.
27#
28#ident	"%Z%%M%	%I%	%E% SMI"
29#
30
31require 5.6.1;
32use strict;
33use warnings;
34use POSIX;
35use File::Basename("basename");
36
my $cmdname = basename($0);		# identity handed to openlog()

my $using_scengen = 0;			# 1 if using scenario simulator (-S)
my $debug = 0;				# 1 if -D was given

# Bounds, in seconds, for the sleep between kstat updates. getstat()
# adjusts $sleeptime within [$min_sleeptime, $max_sleeptime], and parks
# it at $onecpu_sleeptime while there is at most one on-line CPU.

my $min_sleeptime = 1;
my $max_sleeptime = 15;
my $onecpu_sleeptime = (15 * 60);	# used if only 1 CPU on system
my $sleeptime = $min_sleeptime;		# time to sleep between kstat updates

# For timerange_foo variables, see comments at tail of &getstat()

my $timerange_toohi    = 0.01;
my $timerange_hithresh = 0.0003;
my $timerange_lothresh = $timerange_hithresh / 2;
my $unsafe_timerange   = 0.02;

my $statslen = 60;	# time period (in secs) to keep in @deltas
55
56
57# Parse arguments. intrd does not accept any public arguments; the two
58# arguments below are meant for testing purposes. -D generates a significant
59# amount of syslog output. -S <filename> loads the filename as a perl
60# script. That file is expected to implement a kstat "simulator" which
61# can be used to feed information to intrd and verify intrd's responses.
62
# Walk the command line:
#   -S <filename>	run <filename> (the kstat simulator) with do FILE and
#			note that we are simulating; ignored when no filename
#			follows it
#   -D			turn on debug logging
# Any other argument is skipped.
#
# Each argument is tested with defined() rather than for truth, so that an
# argument of "0" or "" does not end the scan early.

while (defined(my $arg = shift(@ARGV))) {
	if ($arg eq "-S" && @ARGV) {
		$using_scengen = 1;

		# The simulator is run while its filename is still at the
		# head of @ARGV. do FILE yields undef and leaves the reason
		# in $@ when the file fails to compile or dies; carrying on
		# without a simulator would only fail later and less clearly.

		my $loaded = do $ARGV[0];
		my $simulator = shift(@ARGV);
		if (!defined($loaded) && $@) {
			die("$cmdname: cannot load simulator $simulator: $@");
		}
	} elsif ($arg eq "-D") {
		$debug = 1;
	}
}
72
# Without a simulator we talk to the real system: load the kstat, interrupt
# and syslog modules at run time, open the log as a daemon, and cap the
# log mask at LOG_DEBUG when debugging or LOG_INFO otherwise.

if (!$using_scengen) {
	require Sun::Solaris::Kstat;
	require Sun::Solaris::Intrs;
	Sun::Solaris::Intrs->import(qw(intrmove));
	require Sys::Syslog;
	Sys::Syslog->import();

	openlog($cmdname, 'pid', 'daemon');

	my $toplevel;
	if ($debug > 0) {
		$toplevel = Sys::Syslog::LOG_DEBUG();
	} else {
		$toplevel = Sys::Syslog::LOG_INFO();
	}
	setlogmask(Sys::Syslog::LOG_UPTO($toplevel));
}
83
84
my $asserted = 0;		# number of VERIFY failures seen so far
my $assert_level = 'debug';	# syslog level for assertion failures

#
# VERIFY($condition, $format, @args)
# Soft assertion. When $condition is numerically 0 the assertion has
# failed: "VERIFY: $format" is logged through syslog() at $assert_level
# with @args as the format arguments, and $asserted is bumped. Returns
# true when the assertion FAILED and false when it held, so callers can
# write "if (VERIFY(...)) { recover }".
#
sub VERIFY($@)
{
	my ($condition, @report) = @_;

	my $failed = ($condition == 0);
	return ($failed) if !$failed;

	my ($format, @fmtargs) = @report;
	syslog($assert_level, "VERIFY: $format", @fmtargs);
	$asserted++;
	return ($failed);
}
97
98
99
100
# Forward declarations, so that each prototype is already known at any call
# which appears before the corresponding definition.

sub getstat($);
sub generate_delta($$);
sub compress_deltas($);
sub dumpdelta($);

sub goodness($);
sub imbalanced($$);
sub do_reconfig($);

sub goodness_cpu($$);		# private function
sub move_intr($$$$);		# private function
sub move_intr_check($$$);	# private function
sub ivecs_to_string(@);		# private function
sub do_find_goal($$$$);		# private function
sub find_goal($$);		# private function
sub do_reconfig_cpu2cpu($$$$);	# private function
sub do_reconfig_cpu($$$);	# private function
117
118
119#
120# What follow are the basic data structures routines of intrd.
121#
122# getstat() is responsible for reading the kstats and generating a "stat" hash.
123#
124# generate_delta() is responsible for taking two "stat" hashes and creating
125# a new "delta" hash that represents what has changed over time.
126#
127# compress_deltas() is responsible for taking a list of deltas and generating
128# a single delta hash that encompasses all the time periods described by the
129# deltas.
130
131
132#
133# getstat() is handed a reference to a kstat and generates a hash, returned
134# by reference, containing all the fields from the kstats which we need.
135# If it returns the scalar 0, it failed to gather the kstats, and the caller
136# should react accordingly.
137#
138# getstat() is also responsible for maintaining a reasonable $sleeptime.
139#
140# {"snaptime"}          kstat's snaptime
141# {<cpuid>}             one hash reference per online cpu
142#  ->{"tot"}            == cpu:<cpuid>:sys:cpu_nsec_{user + kernel + idle}
143#  ->{"crtime"}         == cpu:<cpuid>:sys:crtime
144#  ->{"ivecs"}
145#     ->{<cookie#>}     iterates over pci_intrs::config:cookie
146#        ->{"time"}     == pci_intrs:<ivec#>:config:time (in nsec)
147#        ->{"pil"}      == pci_intrs:<ivec#>:config:pil
148#        ->{"crtime"}   == pci_intrs:<ivec#>:config:crtime
149#        ->{"ino"}      == pci_intrs:<ivec#>:config:ino
150#        ->{"buspath"}  == pci_intrs:<ivec#>:config:buspath
151#        ->{"name"}     == pci_intrs:<ivec#>:config:name
152#        ->{"ihs"}      == pci_intrs:<ivec#>:config:ihs
153#
154
sub getstat($)
{
	my ($ks) = @_;

	# Returns a reference to a freshly built "stat" hash, or the scalar
	# 0 when there is at most one on-line cpu or when the kstats took
	# too long to gather to be trusted. Also retunes $sleeptime.

	my %stat = ();
	my $online = 0;		# number of on-line cpus found

	# The kernel does not generate kstats atomically: every kstat
	# hierarchy carries its own snaptime. On a thrashing system we may
	# be too slow to obtain coherent timing across all of them. We
	# therefore track the earliest and latest snaptime of every kstat
	# we touch; their difference is roughly the time spent in here, and
	# if it approaches the time between snapshots the results may not
	# be useful.

	my $minsnap = -1;	# snaptime is always a positive number
	my $maxsnap = -1;

	# Pass 1: the cpus under cpu:<cpuid>::. Only a cpu whose
	# cpu_info:<cpuid>:cpu_info<cpuid>:state reads "on-line" accepts
	# interrupts, so any other cpu is of no interest. For each one kept,
	# record its totals and fold cpu:<cpuid>:sys:snaptime into
	# $minsnap/$maxsnap.

	while (my ($cpu, $cpst) = each %{$ks->{cpu}}) {
		next if !exists($ks->{cpu_info}{$cpu}{"cpu_info$cpu"}{state});
		next if ($ks->{cpu_info}{$cpu}{"cpu_info$cpu"}{state} !~
		    /^on-line\0/);

		my $sys = $cpst->{sys};

		$stat{$cpu} = {
			tot	=> ($sys->{cpu_nsec_idle} +
				    $sys->{cpu_nsec_user} +
				    $sys->{cpu_nsec_kernel}),
			crtime	=> $sys->{crtime},
			ivecs	=> {},
		};

		my $snap = $sys->{snaptime};
		$minsnap = $snap if ($minsnap == -1 || $snap < $minsnap);
		$maxsnap = $snap if ($snap > $maxsnap);
		$online++;
	}

	if ($online <= 1) {
		$sleeptime = $onecpu_sleeptime;
		return (0);	# nothing to do with 1 CPU
	}

	# Pass 2: the interrupt vectors under pci_intrs. A vector bound to
	# a cpu we did not record above (i.e. not on-line) is ignored.
	# Vectors sharing the same "buspath ino" cookie are merged into a
	# single entry.

	foreach my $inst (values(%{$ks->{pci_intrs}})) {
		my $cfg = $inst->{config};
		my $cpu = $cfg->{cpu};

		next if !exists($stat{$cpu});

		my $snap = $cfg->{snaptime};
		if ($snap < $minsnap) {
			$minsnap = $snap;
		} elsif ($snap > $maxsnap) {
			$maxsnap = $snap;
		}

		my $ivecs = $stat{$cpu}{ivecs};
		my $cookie = "$cfg->{buspath} $cfg->{ino}";

		if (!exists($ivecs->{$cookie})) {
			# First handler seen for this cookie.
			$ivecs->{$cookie} = {
				time	=> $cfg->{time},
				crtime	=> $cfg->{crtime},
				pil	=> $cfg->{pil},
				ino	=> $cfg->{ino},
				buspath	=> $cfg->{buspath},
				name	=> $cfg->{name},
				ihs	=> 1,
			};
			next;
		}

		# Another handler sharing an already-seen cookie: accumulate
		# its time and append its name.

		my $shared = $ivecs->{$cookie};
		$shared->{time} += $cfg->{time};
		$shared->{name} .= "/$cfg->{name}";

		# Keep the newest crtime among the sharers, so that a handler
		# which joined since an earlier getstat is noticed by
		# generate_delta as a change.

		$shared->{crtime} = $cfg->{crtime}
		    if ($cfg->{crtime} > $shared->{crtime});
		$shared->{ihs}++;
	}

	# The "timerange" is the time spent gathering the kstats divided by
	# our sleeptime. If gathering is slow, a delta built from these
	# kstats and an earlier set would cover a substantially different
	# amount of time depending upon which interrupt or CPU is examined.
	# Checking the timerange here guarantees that deltas built from this
	# stat are self-consistent, in that all CPUs and interrupts cover a
	# similar span of time.
	#
	# We try to keep the timerange between $timerange_lothresh and
	# $timerange_hithresh. Too large, and besides the accuracy concern
	# intrd itself is using a lot of CPU time; too small, and our sleep
	# time is large enough that we could be slow to react to a sudden
	# change.
	#
	# $timerange_toohi is the hard upper bound: anything above it is
	# thrown out as garbage. A stat within the bound is treated as an
	# instant in time rather than the range it actually spans, and
	# minsnap is arbitrarily chosen as that instant.

	$stat{snaptime} = $minsnap;
	my $timerange = ($maxsnap - $minsnap) / $sleeptime;

	my $too_slow = ($timerange > $timerange_hithresh &&
	    $sleeptime < $max_sleeptime);
	my $too_fast = ($timerange < $timerange_lothresh &&
	    $sleeptime > $min_sleeptime);

	if ($sleeptime == $onecpu_sleeptime) {
		$sleeptime = $min_sleeptime; # time to come out of idling
	} elsif ($too_slow) {
		$sleeptime++;
	} elsif ($too_fast) {
		$sleeptime--;
	}

	if ($timerange > $timerange_toohi) {
		return (0);	# i.e. failure
	}
	return (\%stat);
}
292
293#
294# dumpdelta takes a reference to our "delta" structure:
295# {"missing"}           "1" if the delta's component stats had inconsistencies
296# {"minsnap"}           time of the first kstat snaptime used in this delta
297# {"maxsnap"}           time of the last kstat snaptime used in this delta
298# {"goodness"}          cost function applied to this delta
299# {"avgintrload"}       avg of interrupt load across cpus, as a percentage
300# {"avgintrnsec"}       avg number of nsec spent in interrupts, per cpu
301# {<cpuid>}             iterates over on-line cpus
302#  ->{"intrs"}          cpu's movable intr time (sum of "time" for each ivec)
303#  ->{"tot"}            CPU load from all sources
304#  ->{"bigintr"}        largest value of {ivecs}{<ivec#>}{time} from below
305#  ->{"intrload"}       intrs / tot
306#  ->{"ivecs"}
307#     ->{<ivec#>}       iterates over ivecs for this cpu
308#        ->{"time"}     time used by this interrupt (in nsec)
309#        ->{"pil"}      pil level of this interrupt
310#        ->{"ino"}      interrupt number
311#        ->{"buspath"}  filename of the directory of the device's bus
312#        ->{"name"}     device name
313#        ->{"ihs"}      number of different handlers sharing this ino
314#
315# It prints out the delta structure in a nice, human readable display.
316#
317
sub dumpdelta($)
{
	my ($delta) = @_;

	# Logs, at syslog level 'debug', a readable rendering of $delta:
	# first the delta-wide figures, then one pair of lines per cpu,
	# each followed by one line per interrupt vector on that cpu.

	syslog('debug', "dumpdelta:");
	if ($delta->{missing} > 0) {
		syslog('debug', " RECONFIGURATION IN DELTA");
	}
	syslog('debug', " avgintrload: %5.2f%%  avgintrnsec: %d",
	       $delta->{avgintrload} * 100, $delta->{avgintrnsec});
	if (exists($delta->{goodness})) {
		syslog('debug', "    goodness: %5.2f%%",
		       $delta->{goodness} * 100);
	}

	foreach my $cpuid (keys(%$delta)) {
		my $cpu = $delta->{$cpuid};
		next unless ref($cpu);		# only cpu entries are refs

		my $total = $cpu->{tot};
		syslog('debug', "    cpu %3d intr %7.3f%%  (bigintr %7.3f%%)",
		       $cpuid, $cpu->{intrload}*100,
		       $cpu->{bigintr}*100/$total);
		syslog('debug', "        intrs %d, bigintr %d",
		       $cpu->{intrs}, $cpu->{bigintr});

		foreach my $inum (keys(%{$cpu->{ivecs}})) {
			my $iv = $cpu->{ivecs}{$inum};

			# Show the handler count only when the vector is
			# shared by more than one handler.
			my $label = $iv->{name};
			if ($iv->{ihs} > 1) {
				$label = "$iv->{name}($iv->{ihs})";
			}
			syslog('debug', "    %15s:%5d: %7.3f%%  %d",
			       $label, $inum, $iv->{time}*100 / $total,
			       $iv->{time});
		}
	}
}
352
353#
354# generate_delta($stat, $newstat) takes two stat references, returned from
355# getstat(), and creates a %delta. %delta (not surprisingly) contains the
356# same basic info as stat and newstat, but with the timestamps as deltas
357# instead of absolute times. We return a reference to the delta.
358#
359
sub generate_delta($$)
{
	my ($stat, $newstat) = @_;

	# Builds the delta between the older $stat and the newer $newstat
	# and returns it by reference. Whenever the two stats cannot be
	# compared reliably (time not ascending, a cpu or interrupt added,
	# removed or re-created, a counter going backwards) the delta is
	# returned at once with {missing} set to 1 and only partly filled.
	#
	# The accumulators start at 0 so that a stat without any cpu entry
	# yields averages of 0 without comparing or dividing undef. The
	# hashes are walked with foreach/keys rather than each so that the
	# early returns below never leave a hash iterator half-way through.

	my %delta = ();
	my $intrload = 0;	# sum of the per-cpu interrupt loads
	my $intrnsec = 0;	# sum of the per-cpu interrupt nsec
	my $cpus = 0;		# number of cpus folded into the sums

	# Take the worstcase timerange
	$delta{minsnap} = $stat->{snaptime};
	$delta{maxsnap} = $newstat->{snaptime};
	if (VERIFY($delta{maxsnap} > $delta{minsnap},
	    "generate_delta: stats aren't ascending")) {
		$delta{missing} = 1;
		return (\%delta);
	}

	# if there are a different number of cpus in the stats, set missing

	$delta{missing} = (keys(%$stat) != keys(%$newstat)) ? 1 : 0;
	if (VERIFY($delta{missing} == 0,
	    "generate_delta: number of CPUs changed")) {
		return (\%delta);
	}

	# scan through every cpu in %newstat and compare against %stat

	foreach my $cpu (keys(%$newstat)) {
		my $newcpst = $newstat->{$cpu};
		next if !ref($newcpst);		# skip non-cpuid fields

		# If %stat is missing a cpu from %newstat, then it was just
		# onlined. Mark missing.

		if (VERIFY(exists $stat->{$cpu} &&
		    $stat->{$cpu}{crtime} == $newcpst->{crtime},
		    "generate_delta: cpu $cpu changed")) {
			$delta{missing} = 1;
			return (\%delta);
		}
		my $cpst = $stat->{$cpu};
		$delta{$cpu}{tot} = $newcpst->{tot} - $cpst->{tot};
		if (VERIFY($delta{$cpu}{tot} >= 0,
		    "generate_delta: deltas are not ascending?")) {
			$delta{missing} = 1;
			delete($delta{$cpu});
			return (\%delta);
		}
		# Avoid remote chance of division by zero
		$delta{$cpu}{tot} = 1 if $delta{$cpu}{tot} == 0;
		$delta{$cpu}{intrs} = 0;
		$delta{$cpu}{bigintr} = 0;

		my %ivecs = ();
		$delta{$cpu}{ivecs} = \%ivecs;

		# if the number of ivecs differs, set missing

		if (VERIFY(keys(%{$cpst->{ivecs}}) ==
			   keys(%{$newcpst->{ivecs}}),
			   "generate_delta: cpu $cpu has more/less".
			   " interrupts")) {
			$delta{missing} = 1;
			return (\%delta);
		}

		foreach my $inum (keys(%{$newcpst->{ivecs}})) {
			my $newivec = $newcpst->{ivecs}{$inum};

			# If this ivec doesn't exist in $stat, or if $stat
			# shows a different crtime, set missing.

			if (VERIFY(exists $cpst->{ivecs}{$inum} &&
				   $cpst->{ivecs}{$inum}{crtime} ==
				   $newivec->{crtime},
				   "generate_delta: cpu $cpu inum $inum".
				   " has changed")) {
				$delta{missing} = 1;
				return (\%delta);
			}
			my $ivec = $cpst->{ivecs}{$inum};

			# Create $delta{$cpu}{ivecs}{$inum}.

			my %dltivec = ();
			$delta{$cpu}{ivecs}{$inum} = \%dltivec;

			# calculate time used by this interrupt

			my $time = $newivec->{time} - $ivec->{time};
			if (VERIFY($time >= 0,
				   "generate_delta: ivec went backwards?")) {
				$delta{missing} = 1;
				delete($delta{$cpu}{ivecs}{$inum});
				return (\%delta);
			}
			$delta{$cpu}{intrs} += $time;
			$dltivec{time} = $time;
			if ($time > $delta{$cpu}{bigintr}) {
				$delta{$cpu}{bigintr} = $time;
			}

			# Transfer over basic info about the kstat. We
			# don't have to worry about discrepancies between
			# ivec and newivec because we verified that both
			# have the same crtime.

			$dltivec{pil} = $newivec->{pil};
			$dltivec{ino} = $newivec->{ino};
			$dltivec{buspath} = $newivec->{buspath};
			$dltivec{name} = $newivec->{name};
			$dltivec{ihs} = $newivec->{ihs};
		}
		if ($delta{$cpu}{tot} < $delta{$cpu}{intrs}) {
			# Ewww! Hopefully just a rounding error.
			# Make something up.
			$delta{$cpu}{tot} = $delta{$cpu}{intrs};
		}
		$delta{$cpu}{intrload} =
		       $delta{$cpu}{intrs} / $delta{$cpu}{tot};
		$intrload += $delta{$cpu}{intrload};
		$intrnsec += $delta{$cpu}{intrs};
		$cpus++;
	}
	if ($cpus > 0) {
		$delta{avgintrload} = $intrload / $cpus;
		$delta{avgintrnsec} = $intrnsec / $cpus;
	} else {
		$delta{avgintrload} = 0;
		$delta{avgintrnsec} = 0;
	}
	return (\%delta);
}
491
492
# compress_deltas takes a list of deltas, and returns a single new delta
494# which represents the combined information from all the deltas. The deltas
495# provided are assumed to be sequential in time. The resulting compressed
496# delta looks just like any other delta. This new delta is also more accurate
497# since its statistics are averaged over a longer period than any of the
498# original deltas.
499
sub compress_deltas ($)
{
	my ($deltas) = @_;

	# $deltas is a reference to a list of deltas, oldest first. Returns
	# a reference to one combined delta, or the scalar 0 when the list
	# is empty or contains a delta flagged {missing}.

	my %newdelta = ();
	my $intrs = 0;		# interrupt nsec summed over all cpus
	my $tot = 0;		# total nsec summed over all cpus
	my $cpus = 0;

	if (VERIFY($#$deltas != -1,
		   "compress_deltas: list of delta is empty?")) {
		return (0);
	}
	$newdelta{minsnap} = $deltas->[0]{minsnap};
	$newdelta{maxsnap} = $deltas->[$#$deltas]{maxsnap};
	$newdelta{missing} = 0;

	# Sum the per-cpu and per-interrupt times over every delta. The
	# descriptive ivec fields are simply taken from the latest delta
	# which carries them.

	foreach my $delta (@$deltas) {
		if (VERIFY($delta->{missing} == 0,
		    "compressing bad deltas?")) {
			return (0);
		}
		foreach my $cpuid (keys(%$delta)) {
			my $cpu = $delta->{$cpuid};
			next if !ref($cpu);	# ignore non-cpu fields

			$intrs += $cpu->{intrs};
			$tot += $cpu->{tot};
			$newdelta{$cpuid}{intrs} += $cpu->{intrs};
			$newdelta{$cpuid}{tot} += $cpu->{tot};
			if (!exists $newdelta{$cpuid}{ivecs}) {
				$newdelta{$cpuid}{ivecs} = {};
			}

			my $newivecs = $newdelta{$cpuid}{ivecs};
			foreach my $inum (keys(%{$cpu->{ivecs}})) {
				my $ivec = $cpu->{ivecs}{$inum};

				$newivecs->{$inum}{time} += $ivec->{time};
				$newivecs->{$inum}{pil} = $ivec->{pil};
				$newivecs->{$inum}{ino} = $ivec->{ino};
				$newivecs->{$inum}{buspath} = $ivec->{buspath};
				$newivecs->{$inum}{name} = $ivec->{name};
				$newivecs->{$inum}{ihs} = $ivec->{ihs};
			}
		}
	}

	# Derive bigintr and intrload for each cpu from the summed times.

	foreach my $cpu (values(%newdelta)) {
		next if !ref($cpu); # ignore non-cpu fields
		$cpus++;

		my $bigintr = 0;
		foreach my $ivec (values(%{$cpu->{ivecs}})) {
			if ($ivec->{time} > $bigintr) {
				$bigintr = $ivec->{time};
			}
		}
		$cpu->{bigintr} = $bigintr;

		# Clamp tot BEFORE dividing by it; the clamp exists to keep
		# this division (and later ones on the delta) away from zero.

		$cpu->{tot} = 1 if $cpu->{tot} <= 0;
		$cpu->{intrload} = $cpu->{intrs} / $cpu->{tot};
	}
	if ($cpus == 0) {
		$newdelta{avgintrnsec} = 0;
		$newdelta{avgintrload} = 0;
	} else {
		$newdelta{avgintrnsec} = $intrs / $cpus;
		$newdelta{avgintrload} = ($tot > 0) ? $intrs / $tot : 0;
	}
	return (\%newdelta);
}
566
567
568
569
570
571# What follow are the core functions responsible for examining the deltas
572# generated above and deciding what to do about them.
573#
# goodness() and its helper goodness_cpu() return a heuristic which describes
575# how good (or bad) the current interrupt balance is. The value returned will
576# be between 0 and 1, with 0 representing maximum goodness, and 1 representing
577# maximum badness.
578#
579# imbalanced() compares a current and historical value of goodness, and
580# determines if there has been enough change to warrant evaluating a
581# reconfiguration of the interrupts
582#
583# do_reconfig(), and its helpers, do_reconfig_cpu(), do_reconfig_cpu2cpu(),
584# find_goal(), do_find_goal(), and move_intr(), are responsible for examining
585# a delta and determining the best possible assignment of interrupts to CPUs.
586#
587# It is important that do_reconfig() be in alignment with goodness(). If
588# do_reconfig were to generate a new interrupt distribution that worsened
589# goodness, we could get into a pathological loop with intrd fighting itself,
590# constantly deciding that things are imbalanced, and then changing things
591# only to make them worse.
592
593
594
595# any goodness over $goodness_unsafe_load is considered really bad
596# goodness must drop by at least $goodness_mindelta for a reconfig
597
my $goodness_unsafe_load = 0.9;	# interrupt load above this is "really bad"
my $goodness_mindelta = 0.1;	# least improvement that justifies a reconfig
600
# goodness(%delta) examines a delta and returns its "goodness". goodness will
602# be between 0 (best) and 1 (major bad). goodness is determined by evaluating
603# the goodness of each individual cpu, and returning the worst case. This
604# helps on systems with many CPUs, where otherwise a single pathological CPU
605# might otherwise be ignored because the average was OK.
606#
607# To calculate the goodness of an individual CPU, we start by looking at its
608# load due to interrupts. If the load is above a certain high threshold and
609# there is more than one interrupt assigned to this CPU, we set goodness
610# to worst-case. If the load is below the average interrupt load of all CPUs,
611# then we return best-case, since what's to complain about?
612#
613# Otherwise we look at how much the load is above the average, and return
614# that as the goodness, with one caveat: we never return more than the CPU's
615# interrupt load ignoring its largest single interrupt source. This is
616# because a CPU with one high-load interrupt, and no other interrupts, is
617# perfectly balanced. Nothing can be done to improve the situation, and thus
618# it is perfectly balanced even if the interrupt's load is 100%.
619
sub goodness($)
{
	my ($delta) = @_;

	# Returns the worst (largest) goodness_cpu() value over every cpu
	# in $delta, or 1 straight away when the delta is flagged missing,
	# when a cpu already scores 1, or when a cpu's score falls outside
	# [0, 1] (in which case the delta is also dumped to the log).

	if ($delta->{missing} > 0) {
		return (1);
	}

	my $worst = 0;

	foreach my $cpuid (keys(%$delta)) {
		my $cpu = $delta->{$cpuid};
		next unless ref($cpu);		# only cpu entries are refs

		my $score = goodness_cpu($cpu, $delta->{avgintrload});
		if (VERIFY($score >= 0 && $score <= 1,
			   "goodness: cpu goodness out of range?")) {
			dumpdelta($delta);
			return (1);
		}

		# Nothing can be worse than 1, so stop looking.
		return (1) if ($score == 1);

		$worst = $score if ($score > $worst);
	}
	return ($worst);
}
647
sub goodness_cpu($$)		# private function
{
	my ($cpu, $avgintrload) = @_;

	# Scores one cpu entry of a delta against the average interrupt
	# load: 0 is best, 1 is worst.

	my $intrs = $cpu->{intrs};
	my $tot = $cpu->{tot};
	my $load = $intrs / $tot;

	# A cpu loaded below the average is perfectly good.
	if ($load < $avgintrload) {
		return (0);
	}

	# The load which would remain if the single biggest interrupt were
	# the only one left out of the count. This is the most gain we can
	# get on this CPU from offloading interrupts.

	my $movable = ($intrs - $cpu->{bigintr}) / $tot;

	# A cpu saturated with interrupt handling while serving more than
	# one interrupt source is a major imbalance: the other sources could
	# be starved if of a lower pil. Report the worst possible value,
	# which effectively contaminates the entire delta.

	my $sources = keys(%{$cpu->{ivecs}});
	if ($sources > 1 && $load > $goodness_unsafe_load) {
		return (1);
	}

	# Otherwise the score is the excess over the average, but never
	# more than what could actually be moved away.

	my $excess = $load - $avgintrload;
	return ($excess > $movable ? $movable : $excess);
}
682
683
684# imbalanced() is used by the main routine to determine if the goodness
685# has shifted far enough from our last baseline to warrant a reassignment
686# of interrupts. A very high goodness indicates that a CPU is way out of
687# whack. If the goodness has varied too much since the baseline, then
688# perhaps a reconfiguration is worth considering.
689
sub imbalanced ($$)
{
	my ($goodness, $baseline) = @_;

	# Returns 1 when $goodness is pathological in its own right (above
	# .50) or has moved more than $goodness_mindelta away from
	# $baseline in either direction; returns 0 otherwise.

	if ($goodness > .50 ||
	    abs($goodness - $baseline) > $goodness_mindelta) {
		return (1);
	}
	return (0);
}
700
701# do_reconfig(), do_reconfig_cpu(), and do_reconfig_cpu2cpu(), are the
702# decision-making functions responsible for generating a new interrupt
703# distribution. They are designed with the definition of goodness() in
704# mind, i.e. they use the same definition of "good distribution" as does
705# goodness().
706#
707# do_reconfig() is responsible for deciding whether a redistribution is
708# actually warranted. If the goodness is already pretty good, it doesn't
709# waste the CPU time to generate a new distribution. If it
710# calculates a new distribution and finds that it is not sufficiently
# improved from the prior distribution, it will not do the redistribution,
712# mainly to avoid the disruption to system performance caused by
713# rejuggling interrupts.
714#
715# Its main loop works by going through a list of cpus sorted from
716# highest to lowest interrupt load. It removes the highest-load cpus
717# one at a time and hands them off to do_reconfig_cpu(). This function
718# then re-sorts the remaining CPUs from lowest to highest interrupt load,
719# and one at a time attempts to rejuggle interrupts between the original
720# high-load CPU and the low-load CPU. Rejuggling on a high-load CPU is
721# considered finished as soon as its interrupt load is within
722# $goodness_mindelta of the average interrupt load. Such a CPU will have
723# a goodness of below the $goodness_mindelta threshold.
724
725#
726# move_intr(\%delta, $inum, $oldcpu, $newcpu)
727# used by reconfiguration code to move an interrupt between cpus within
728# a delta. This manipulates data structures, and does not actually move
729# the interrupt on the running system.
730#
sub move_intr($$$$)		# private function
{
	my ($delta, $inum, $oldcpuid, $newcpuid) = @_;

	# Re-homes interrupt $inum from cpu $oldcpuid to cpu $newcpuid
	# inside $delta, keeping each cpu's intrs, intrload and bigintr in
	# step. Only the data structure is changed here.

	my $src = $delta->{$oldcpuid};
	my $moved = $src->{ivecs}{$inum};
	my $nsec = $moved->{time};

	# Take the interrupt away from its old cpu.

	$src->{intrs} -= $nsec;
	$src->{intrload} = $src->{intrs} / $src->{tot};
	delete($src->{ivecs}{$inum});

	VERIFY($src->{intrs} >= 0, "move_intr: intr's time > total time?");
	VERIFY($nsec <= $src->{bigintr},
	       "move_intr: intr's time > bigintr?");

	# If the departing interrupt was the old cpu's biggest, rescan the
	# interrupts which remain there for the new biggest.

	if ($nsec >= $src->{bigintr}) {
		my $biggest = 0;

		foreach my $left (values(%{$src->{ivecs}})) {
			if ($left->{time} > $biggest) {
				$biggest = $left->{time};
			}
		}
		$src->{bigintr} = $biggest;
	}

	# Hand the interrupt to its new cpu.

	my $dst = $delta->{$newcpuid};

	$moved->{nowcpu} = $newcpuid;
	$dst->{ivecs}{$inum} = $moved;
	$dst->{intrs} += $nsec;
	$dst->{intrload} = $dst->{intrs} / $dst->{tot};

	if ($nsec > $dst->{bigintr}) {
		$dst->{bigintr} = $nsec;
	}
}
769
sub move_intr_check($$$)	# private function
{
	my ($delta, $oldcpuid, $newcpuid) = @_;

	# Sanity check for use after interrupts have been moved: neither
	# the source nor the target cpu may carry more interrupt time than
	# total time. A violation is only reported, through VERIFY.

	my $src_sane =
	    ($delta->{$oldcpuid}{tot} >= $delta->{$oldcpuid}{intrs});
	VERIFY($src_sane, "Moved interrupts left 100+%% load on src cpu");

	my $tgt_sane =
	    ($delta->{$newcpuid}{tot} >= $delta->{$newcpuid}{intrs});
	VERIFY($tgt_sane, "Moved interrupts left 100+%% load on tgt cpu");
}
779
sub ivecs_to_string(@)		# private function
{
	# Takes a list of ivec hash references and returns their {inum}
	# values as one string, each value preceded by a single space.
	# An empty list gives the empty string.

	my @inums = map { $_->{inum} } @_;
	return (@inums ? " " . join(" ", @inums) : "");
}
788
789
sub do_reconfig($)
{
	my ($delta) = @_;

	# Works out a better interrupt assignment for $delta and, when it
	# is enough of an improvement, applies it with intrmove().
	# Returns 0 when nothing was worth doing, -1 when the delta is
	# unusable or at least one interrupt could not be moved, and 1
	# otherwise.

	my $goodness = $delta->{goodness};

	# Goodness can never get better than 0. If even reaching 0 would
	# be an improvement too small to merit the action, stop here.

	if ($goodness < $goodness_mindelta) {
		syslog('debug', "goodness good enough, don't reconfig");
		return (0);
	}

	syslog('notice', "Optimizing interrupt assignments");

	if (VERIFY ($delta->{missing} == 0, "RECONFIG Aborted: should not ".
	    "have a delta with missing")) {
		return (-1);
	}

	# Collect every cpuid, and tag each ivec with the cpu it started
	# on, the cpu it is on now, and its own interrupt number.

	my @cpusortlist = ();

	foreach my $cpuid (keys(%$delta)) {
		my $cpu = $delta->{$cpuid};
		next unless ref($cpu);	# skip non-cpu entries

		push(@cpusortlist, $cpuid);
		foreach my $inum (keys(%{$cpu->{ivecs}})) {
			my $ivec = $cpu->{ivecs}{$inum};
			@{$ivec}{qw(origcpu nowcpu inum)} =
			    ($cpuid, $cpuid, $inum);
		}
	}

	# Repeatedly take the cpu with the highest interrupt load off the
	# list and try to redistribute its interrupts. A cpu whose goodness
	# is below the threshold is skipped. Once the highest-loaded cpu is
	# neither unsafe nor more than the threshold above the average
	# load, no cpu left is worth reconfiguring and we are done.

	while (@cpusortlist) {
		# Sort afresh on every pass, since do_reconfig_cpu can
		# move interrupts around.

		@cpusortlist =
		    sort({$delta->{$b}{intrload} <=> $delta->{$a}{intrload}}
		    @cpusortlist);

		my $busiest = shift(@cpusortlist);
		my $load = $delta->{$busiest}{intrload};
		my $avgload = $delta->{avgintrload};

		if ($load <= $goodness_unsafe_load &&
		    $load <= $avgload + $goodness_mindelta) {
			syslog('debug', "finished reconfig: cpu $busiest ".
			    "load $load avgload $avgload");
			last;
		}
		next if (goodness_cpu($delta->{$busiest}, $avgload) <
		    $goodness_mindelta);

		do_reconfig_cpu($delta, \@cpusortlist, $busiest);
	}

	# How good a job did we do? If the improvement was minimal, and
	# our goodness wasn't pathological (and thus needing any help it
	# can get), then don't bother moving the interrupts.

	my $newgoodness = goodness($delta);
	VERIFY($newgoodness <= $goodness,
	       "reconfig: result has worse goodness?");

	my $gain = $goodness - $newgoodness;
	if (($goodness != 1 || $newgoodness == 1) &&
	    $gain < $goodness_mindelta) {
		syslog('debug', "goodness already near optimum, ".
		       "don't reconfig");
		return (0);
	}
	syslog('debug', "goodness %5.2f%% --> %5.2f%%", $goodness*100,
	       $newgoodness*100);

	# Time to move those interrupts! Only an ivec which ended up on a
	# cpu other than the one it started on needs a real move.

	my $ret = 1;
	my $warned = 0;

	foreach my $cpuid (keys(%$delta)) {
		next if $cpuid =~ /\D/;

		my $cpu = $delta->{$cpuid};
		foreach my $inum (keys(%{$cpu->{ivecs}})) {
			my $ivec = $cpu->{ivecs}{$inum};

			next if ($ivec->{origcpu} == $cpuid);
			next if intrmove($ivec->{buspath}, $ivec->{ino},
			    $cpuid);

			# Warn once in total, but give the details of
			# every failure at debug level.

			syslog('warning', "Unable to move interrupts")
			    if $warned++ == 0;
			syslog('debug', "Unable to move buspath ".
			    "$ivec->{buspath} ino $ivec->{ino} to ".
			    "cpu $cpuid");
			$ret = -1;
		}
	}

	syslog('notice', "Interrupt assignments optimized");
	return ($ret);
}
900
sub do_reconfig_cpu($$$)	# private function
{
	my ($delta, $cpusortlist, $oldcpuid) = @_;

	# Tries to improve the load on $oldcpuid by rejuggling interrupts
	# between it and the cpus named on $cpusortlist. The caller keeps
	# that list sorted from highest to lowest interrupt load, so we
	# walk a reversed copy, i.e. from lowest to highest load. We stop
	# as soon as $oldcpuid's goodness is below the threshold, or when
	# the next candidate carries a higher load than $oldcpuid itself.

	syslog('debug', "reconfiguring $oldcpuid");

	my $src = $delta->{$oldcpuid};
	my $avgintrload = $delta->{avgintrload};

	foreach my $tgtcpuid (reverse(@$cpusortlist)) {
		last if goodness_cpu($src, $avgintrload) < $goodness_mindelta;

		my $srcload = $src->{intrload};
		last if $delta->{$tgtcpuid}{intrload} > $srcload;

		do_reconfig_cpu2cpu($delta, $oldcpuid, $tgtcpuid, $srcload);
	}
}
931
sub do_reconfig_cpu2cpu($$$$)	# private function
{
	my ($delta, $srccpuid, $tgtcpuid, $srcload) = @_;

	# Consider redistributing interrupts between srccpuid (the more
	# heavily loaded CPU) and tgtcpuid (the more lightly loaded one).
	# The ivecs of both CPUs are pooled, find_goal() picks the subset
	# that should live on srccpuid, and move_intr() records every
	# resulting change of CPU in $delta.
	#
	# $srcload is srccpuid's interrupt load on entry; it is only used
	# for the sanity check at the very end.

	syslog('debug', "exchanging intrs between $srccpuid and $tgtcpuid");

	# Pool the ivecs of both CPUs; nothing to do if there are none.

	my @pool = (values(%{$delta->{$srccpuid}{ivecs}}),
	    values(%{$delta->{$tgtcpuid}{ivecs}}));
	return unless @pool;

	# Heaviest interrupt first.

	my @ivecs = sort({$b->{time} <=> $a->{time}} @pool);

	# The goal load for srccpuid is normally the average across all
	# CPUs. tgtcpuid is lighter than srccpuid but may itself be above
	# that average, so never aim lower than the mean of the two CPUs:
	# that would just push srccpuid below tgtcpuid.

	my $pairavg =
	    ($delta->{$srccpuid}{intrs} + $delta->{$tgtcpuid}{intrs}) / 2;
	my $goal = $delta->{avgintrnsec};
	$goal = $pairavg if $goal < $pairavg;

	# If the single largest interrupt started out on srccpuid, leave it
	# where it is to minimize disruption: take it out of the candidate
	# list and charge its time against the goal.

	if ($ivecs[0]->{origcpu} == $srccpuid) {
		syslog('debug', "Keeping $ivecs[0]->{inum} on $srccpuid");
		my $kept = shift(@ivecs);
		$goal -= $kept->{time};
	}

	syslog('debug', "GOAL: inums should total $goal");
	find_goal(\@ivecs, $goal);

	# find_goal() has set {goal} to 1 on each ivec that belongs on
	# srccpuid and to 0 on each one that belongs on tgtcpuid. Move
	# every ivec that is not already where it belongs.

	foreach my $ivec (@ivecs) {
		syslog('debug', "ivec $ivec->{inum} goal $ivec->{goal}");
		VERIFY($ivec->{nowcpu} == $srccpuid ||
		    $ivec->{nowcpu} == $tgtcpuid, "cpu2cpu found an ".
		    "interrupt not currently on src or tgt cpu");

		my $dest = $ivec->{goal} ? $srccpuid : $tgtcpuid;
		next if $ivec->{nowcpu} == $dest;
		move_intr($delta, $ivec->{inum}, $ivec->{nowcpu}, $dest);
	}
	move_intr_check($delta, $srccpuid, $tgtcpuid); # asserts

	# srccpuid must not have become more loaded, and must still sit
	# above the average load.

	my $newload = $delta->{$srccpuid}{intrs} / $delta->{$srccpuid}{tot};
	VERIFY($newload <= $srcload && $newload > $delta->{avgintrload},
	    "cpu2cpu: new load didn't end up in expected range");
}
1004
1005
1006# find_goal() and its helper do_find_goal() are used to find the best
1007# combination of interrupts in order to generate a load that is as close
1008# as possible to a goal load without falling below that goal. Before returning
1009# to its caller, find_goal() sets a new value in the hash of each interrupt,
1010# {goal}, which if set signifies that this interrupt is one of the interrupts
1011# identified as part of the set of interrupts which best meet the goal.
1012#
1013# The arguments to find_goal are a list of ivecs (hash references), sorted
1014# by descending {time}, and the goal load. The goal is relative to {time}.
1015# The best fit is determined by performing a depth-first search. do_find_goal
1016# is the recursive subroutine which carries out the search.
1017#
1018# It is passed an index as an argument, originally 0. On a given invocation,
1019# it is only to consider interrupts in the ivecs array starting at that index.
1020# It then considers two possibilities:
1021#   1) What is the best goal-fit if I include ivecs[index]?
1022#   2) What is the best goal-fit if I exclude ivecs[index]?
1023# To determine case 1, it subtracts the load of ivecs[index] from the goal,
1024# and calls itself recursively with that new goal and index++.
1025# To determine case 2, it calls itself recursively with the same goal and
1026# index++.
1027#
# It then compares the two results, decides which one best meets the goals,
1029# and returns the result. The return value is the best-fit's interrupt load,
1030# followed by a list of all the interrupts which make up that best-fit.
1031#
1032# As an optimization, a second array loads[] is created which mirrors ivecs[].
1033# loads[i] will equal the total loads of all ivecs[i..$#ivecs]. This is used
1034# by do_find_goal to avoid recursing all the way to the end of the ivecs
1035# array if including all remaining interrupts will still leave the best-fit
1036# at below goal load. If so, it then includes all remaining interrupts on
1037# the goal list and returns.
1038#
sub find_goal($$)		# private function
{
	my ($ivecs, $goal) = @_;

	# Arguments:
	#   $ivecs - array ref of ivec hash refs, sorted by descending {time}
	#   $goal  - the load to reach, in the same units as {time}
	#
	# On return every ivec in @$ivecs has {goal} set to 1 if it is part
	# of the chosen best-fit set, or 0 if it is not. Nothing useful is
	# returned.

	my @goals = ();
	my $ivec;

	# Load of the chosen set. It starts at 0, the load of the empty set,
	# so that the VERIFY below never compares an undefined value when
	# $goal <= 0 and the search is skipped.

	my $load = 0;

	if ($goal > 0) {
		syslog('debug', "finding goal from intrs %s",
		    ivecs_to_string(@$ivecs));

		# Generate @loads array: $loads[i] is the total {time} of
		# ivecs[i..$#ivecs].

		my $tot = 0;
		foreach $ivec (@$ivecs) {
			$tot += $ivec->{time};
		}
		my @loads = ();
		foreach $ivec (@$ivecs) {
			push(@loads, $tot);
			$tot -= $ivec->{time};
		}
		($load, @goals) = do_find_goal($ivecs, \@loads, $goal, 0);
	}
	# else: a goal of zero or less is best met by the empty set

	VERIFY($load >= $goal, "find_goal didn't meet goals");
	syslog('debug', "goals found: %s", ivecs_to_string(@goals));

	# Set or clear $ivec->{goal} for each ivec, based on returned @goals.
	# @goals lists its members in the same order as @$ivecs, so it is
	# enough to compare each ivec against the head of @goals.

	foreach $ivec (@$ivecs) {
		if (@goals && $ivec == $goals[0]) {
			syslog('debug', "inum $ivec->{inum} on source cpu");
			$ivec->{goal} = 1;
			shift(@goals);
		} else {
			syslog('debug', "inum $ivec->{inum} on target cpu");
			$ivec->{goal} = 0;
		}
	}
}
1082
1083
sub do_find_goal($$$$)		# private function
{
	my ($ivecs, $loads, $goal, $idx) = @_;

	# Recursive helper for find_goal(). Considers only ivecs[$idx ..]
	# and returns a list: the load of the best-fit selection, followed
	# by the ivecs which make up that selection (in @$ivecs order).

	# Past the end of the list: nothing left to pick, load is zero.

	return (0) if $idx > $#{$ivecs};

	syslog('debug', "$idx: finding goal $goal inum $ivecs->[$idx]{inum}");

	# Shortcut: if everything from $idx onward together does not exceed
	# the goal, no subset can do better, so take all of it.

	if ($loads->[$idx] <= $goal) {
		syslog('debug',
		    "$idx: including all remaining intrs %s with load %d",
		    ivecs_to_string(@$ivecs[$idx .. $#{$ivecs}]),
		    $loads->[$idx]);
		return ($loads->[$idx], @$ivecs[$idx .. $#{$ivecs}]);
	}

	# Option "with": the best selection that includes ivecs[$idx]. If
	# this one interrupt already reaches the goal, stop there; adding
	# more would only take us further above it. Otherwise recurse for
	# whatever part of the goal is still outstanding.

	my $load = $ivecs->[$idx]{time};
	my $with = $load;
	my @goals_with = ();

	if ($goal > $load) {
		my ($restload, @rest) =
		    do_find_goal($ivecs, $loads, $goal - $load, $idx + 1);
		$with = $restload + $load;
		@goals_with = @rest;
	}
	syslog('debug', "$idx: with-load $with intrs %s",
	       ivecs_to_string($ivecs->[$idx], @goals_with));

	# Option "without": the best selection that excludes ivecs[$idx].

	my ($without, @goals_without) =
	    &do_find_goal($ivecs, $loads, $goal, $idx + 1);
	syslog('debug', "$idx: without-load $without intrs %s",
	       ivecs_to_string(@goals_without));

	# Pick the better option. One that reaches the goal beats one that
	# does not. If both reach it, the smaller load wins; if neither
	# does, the larger load wins. On a tie we go "with".

	my $with_ok = ($with >= $goal);
	my $without_ok = ($without >= $goal);
	my $go_without;

	if ($with_ok && $without_ok) {
		$go_without = ($without < $with);
	} elsif ($with_ok) {
		$go_without = 0;
	} elsif ($without_ok) {
		$go_without = 1;
	} else {
		$go_without = ($without > $with);
	}

	# Hand back the winning load followed by the ivecs that produce it.

	if ($go_without) {
		syslog('debug', "$idx: going without");
		return ($without, @goals_without);
	}

	syslog('debug', "$idx: going with");
	return ($with, $ivecs->[$idx], @goals_with);
}
1163
1164
1165
1166
# Daemon start-up: announce ourselves, declare the state shared with the
# main loop, arrange for signals to be noticed, and take the first kstat
# snapshot.

syslog('debug', "intrd is starting".($debug ? " (debug)" : ""));

# State kept across iterations of the main loop

my @deltas = ();		# recent deltas, oldest first
my $deltas_tottime = 0;		# sum of maxsnap-minsnap across @deltas
my $baseline_goodness = 0;
my ($avggoodness, $compdelta, $do_reconfig);

# Scratch variables used by the main loop

my ($goodness, $deltatime, $olddelta, $olddeltatime, $delta);
my ($newstat, $below_statslen, $newtime, $ret);

# INT, HUP and TERM only set a flag, which the main loop polls at safe
# points, so that we don't die in the middle of retargeting.

my $gotsig = 0;
my $note_signal = sub { $gotsig = 1; };
foreach my $signame ('INT', 'HUP', 'TERM') {
	$SIG{$signame} = $note_signal;
}

# Kstats come from the system, or from the simulator if one is in use.

my $ks = ($using_scengen == 0)
    ? Sun::Solaris::Kstat->new()
    : myks_update();		# supplied by the simulator

# If no pci_intrs kstats were found, we need to exit, but we can't because
# SMF will restart us and/or report an error to the administrator. But
# there's nothing an administrator can do. So print out a message for SMF
# logs and silently pause until a signal tells us to go away.

unless (exists($ks->{pci_intrs})) {
	print STDERR "$cmdname: no interrupts were found; ".
	    "your PCI bus may not yet be supported\n";
	while ($gotsig == 0) {
		pause();
	}
	exit 0;
}

my $stat = getstat($ks);
1214
1215
1216
# Throw away all accumulated statistics and start from scratch. This is a
# named sub, so it is compiled once and shares the file-level @deltas,
# $deltas_tottime and $stat with the main loop below.

sub clear_deltas {
	@deltas = ();
	$deltas_tottime = 0;
	$stat = 0;   # prevent next generate_delta() from setting {missing}
}

# Main loop: runs until a signal has been noticed.

while (1) {
	# 1. Sleep, update the kstats, and save the new stats in $newstat.
	# The simulator, when in use, supplies the kstats without sleeping.

	exit 0 if $gotsig;		# if we got ^C / SIGTERM, exit
	if ($using_scengen != 0) {
		$ks = myks_update();
	} else {
		sleep($sleeptime);
		exit 0 if $gotsig;	# if we got ^C / SIGTERM, exit
		$ks->update();
	}
	$newstat = getstat($ks);

	# $stat or $newstat could be zero if they're uninitialized, or if
	# getstat() failed. If $newstat is zero, sleep and try again,
	# hoping the problem will clear up. If $stat is zero, $newstat
	# becomes our starting point and we likewise sleep and try again.

	next unless ref($newstat);
	unless (ref($stat)) {
		$stat = $newstat;
		next;
	}


	# 2. Compare $newstat with the prior set of values, result in %$delta.

	$delta = generate_delta($stat, $newstat);
	dumpdelta($delta) if $debug;	# Dump most recent stats to stdout.
	$stat = $newstat;	# The new stats now become the old stats.


	# 3. If $delta->{missing}, then there has been a reconfiguration of
	# either cpus or interrupts (probably both). We need to toss out our
	# old set of statistics and start from scratch.
	#
	# Also, if the delta covers a very long range of time, then we've
	# been experiencing a system overload that has resulted in intrd
	# not being allowed to run effectively for a while now. As above,
	# toss our old statistics and start from scratch.

	$deltatime = $delta->{maxsnap} - $delta->{minsnap};
	if ($delta->{missing} > 0 || $deltatime > $statslen) {
		clear_deltas();
		syslog('debug', "evaluating interrupt assignments");
		next;
	}


	# 4. Incorporate new delta into the list of deltas, and associated
	# statistics. If this delta is the one that takes the accumulated
	# time from below $statslen to at least $statslen, then it's time
	# to evaluate a reconfiguration.

	$below_statslen = ($deltas_tottime < $statslen);
	$deltas_tottime += $deltatime;
	$do_reconfig = ($below_statslen && $deltas_tottime >= $statslen);
	push(@deltas, $delta);

	# 5. Remove old deltas if total time is more than $statslen. We use
	# @deltas as a moving average of the last $statslen seconds. Shift
	# off the oldest deltas, but only if that doesn't cause us to fall
	# below $statslen seconds.

	while (scalar(@deltas) > 1) {
		$olddelta = $deltas[0];
		$olddeltatime = $olddelta->{maxsnap} - $olddelta->{minsnap};
		$newtime = $deltas_tottime - $olddeltatime;
		last if ($newtime < $statslen);

		shift(@deltas);
		$deltas_tottime = $newtime;
	}

	# 6. The brains of the operation are here. First, check if we're
	# imbalanced, and if so set $do_reconfig. If $do_reconfig is set,
	# either because of imbalance or above in step 4, we evaluate a
	# new configuration.
	#
	# First, take @deltas and generate a single "compressed" delta
	# which summarizes them all. Pass that to do_reconfig and see
	# what it does with it:
	#
	# $ret == -1 : failure
	# $ret ==  0 : current config is optimal (or close enough)
	# $ret ==  1 : reconfiguration has occurred
	#
	# If $ret is -1 or 1, dump all our deltas and start from scratch.
	# Step 4 above will set do_reconfig soon thereafter.
	#
	# If $ret is 0, then nothing has happened because we're already
	# good enough. Set baseline_goodness to current goodness.

	$compdelta = compress_deltas(\@deltas);
	if (VERIFY(ref($compdelta) eq "HASH", "couldn't compress deltas")) {
		# The check failed; discard everything and start over.
		clear_deltas();
		next;
	}
	$compdelta->{goodness} = goodness($compdelta);
	dumpdelta($compdelta) if $debug;

	$goodness = $compdelta->{goodness};
	syslog('debug', "GOODNESS: %5.2f%%", $goodness * 100);

	# Imbalance only counts once a full $statslen of history exists.

	if ($deltas_tottime >= $statslen) {
		$do_reconfig = 1
		    if imbalanced($goodness, $baseline_goodness);
	}

	if ($do_reconfig) {
		$ret = do_reconfig($compdelta);

		if ($ret == 0) {
			syslog('debug', "setting new baseline of $goodness");
			$baseline_goodness = $goodness;
		} else {
			clear_deltas();
			syslog('debug', "do_reconfig FAILED!") if $ret == -1;
		}
	}
	syslog('debug', "---------------------------------------");
}
1344