xref: /titanic_52/usr/src/cmd/intrd/intrd.pl (revision df0345f7d6cc87cde9e532e8362f1aca053d98cc)
1bd335c64Sesolom#!/usr/perl5/bin/perl
2bd335c64Sesolom#
3bd335c64Sesolom# CDDL HEADER START
4bd335c64Sesolom#
5bd335c64Sesolom# The contents of this file are subject to the terms of the
6d89fccd8Sschwartz# Common Development and Distribution License (the "License").
7d89fccd8Sschwartz# You may not use this file except in compliance with the License.
8bd335c64Sesolom#
9bd335c64Sesolom# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10bd335c64Sesolom# or http://www.opensolaris.org/os/licensing.
11bd335c64Sesolom# See the License for the specific language governing permissions
12bd335c64Sesolom# and limitations under the License.
13bd335c64Sesolom#
14bd335c64Sesolom# When distributing Covered Code, include this CDDL HEADER in each
15bd335c64Sesolom# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16bd335c64Sesolom# If applicable, add the following below this CDDL HEADER, with the
17bd335c64Sesolom# fields enclosed by brackets "[]" replaced with your own identifying
18bd335c64Sesolom# information: Portions Copyright [yyyy] [name of copyright owner]
19bd335c64Sesolom#
20bd335c64Sesolom# CDDL HEADER END
21bd335c64Sesolom#
22bd335c64Sesolom
23bd335c64Sesolom#
24*df0345f7SJohn Sonnenschein# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
25bd335c64Sesolom# Use is subject to license terms.
26bd335c64Sesolom#
27bd335c64Sesolom
28*df0345f7SJohn Sonnenscheinrequire 5.8.4;
29bd335c64Sesolomuse strict;
30bd335c64Sesolomuse warnings;
31bd335c64Sesolomuse POSIX;
32bd335c64Sesolomuse File::Basename("basename");
33bd335c64Sesolom
# Name this program was invoked as; used as the syslog identity.
my $cmdname = basename($0);

# Test-only switches, set from the command line.
my $using_scengen = 0;		# 1 if using scenario simulator (-S)
my $debug = 0;			# 1 if debug logging was requested (-D)

# Sampling intervals, in seconds.
my $normal_sleeptime = 10;		# time to sleep between samples
my $idle_sleeptime = 45;		# time to sleep when idle
my $onecpu_sleeptime = 15 * 60;		# used if only 1 CPU on system
my $sleeptime = $normal_sleeptime;	# either normal_ or idle_ or onecpu_

# The system counts as idle while the interrupt load stays below 10%.
my $idle_intrload = 0.1;

# Upper bound on (time spent gathering kstats) / $sleeptime for a usable stat.
my $timerange_toohi = 0.01;

# Time period (in secs) to keep in @deltas.
my $statslen = 60;
49bd335c64Sesolom
50bd335c64Sesolom# Parse arguments. intrd does not accept any public arguments; the two
51bd335c64Sesolom# arguments below are meant for testing purposes. -D generates a significant
52bd335c64Sesolom# amount of syslog output. -S <filename> loads the filename as a perl
53bd335c64Sesolom# script. That file is expected to implement a kstat "simulator" which
54bd335c64Sesolom# can be used to feed information to intrd and verify intrd's responses.
55bd335c64Sesolom
# Walk the command line.  Only the two private testing options are
# recognized; any other argument is silently ignored, as before.
#
#   -S <file>	execute <file> as perl code (the kstat simulator)
#   -D		enable debug-level logging
#
# The loop tests for definedness rather than truth so that an argument of
# "0" or "" does not end parsing early, and it uses a lexical instead of
# clobbering the global $_.
while (defined(my $arg = shift @ARGV)) {
	if ($arg eq "-S" && @ARGV) {
		my $simulator = $ARGV[0];

		$using_scengen = 1;
		$! = 0;
		my $loaded = do $simulator;	# load simulator
		shift @ARGV;

		# do FILE reports problems only through $@ / $!; surface
		# them rather than continuing silently without a simulator.
		# These are warnings only: startup proceeds as it did before.
		if ($@) {
			warn "$cmdname: cannot compile $simulator: $@";
		} elsif (!defined($loaded) && $!) {
			warn "$cmdname: cannot read $simulator: $!\n";
		}
	} elsif ($arg eq "-D") {
		$debug = 1;
	}
}
65bd335c64Sesolom
# Without the simulator we talk to the real system: load the kstat,
# interrupt and syslog modules at run time, import the functions we call,
# and open the log.  Messages above the chosen priority are masked off:
# debug-level output is only let through when -D was given.
if (!$using_scengen) {
	require Sun::Solaris::Kstat;
	require Sun::Solaris::Intrs;
	Sun::Solaris::Intrs->import(qw(intrmove is_pcplusmp));
	require Sys::Syslog;
	Sys::Syslog->import();

	openlog($cmdname, 'pid', 'daemon');

	my $maxpri = ($debug > 0) ? Sys::Syslog::LOG_DEBUG() :
	    Sys::Syslog::LOG_INFO();
	setlogmask(Sys::Syslog::LOG_UPTO($maxpri));
}
76bd335c64Sesolom
my $asserted = 0;		# count of VERIFY failures seen so far
my $assert_level = 'debug';	# syslog level for assertion failures

#
# VERIFY(cond, fmt, args...) is a non-fatal assertion.  When cond is
# numerically zero the assertion has failed: the message is logged through
# syslog() at $assert_level (fmt is prefixed with "VERIFY: " and used as the
# syslog format, args are its arguments) and $asserted is bumped.
#
# Returns true if the assertion FAILED, false otherwise, so callers write
#	if (VERIFY(cond, "...")) { handle the failure }
#
sub VERIFY($@)
{
	my ($cond, $fmt, @args) = @_;

	my $failed = ($cond == 0);	# a zero condition means assert failed
	if (!$failed) {
		return ($failed);
	}

	syslog($assert_level, "VERIFY: $fmt", @args);
	$asserted++;
	return ($failed);
}
89bd335c64Sesolom
90bd335c64Sesolom
91bd335c64Sesolom
92bd335c64Sesolom
# Forward declarations.  These make each prototype known to the compiler
# before the first call, wherever in the file the definition appears.

# Building and inspecting stats and deltas.
sub getstat($$);
sub generate_delta($$);
sub compress_deltas($);
sub dumpdelta($);

# Evaluating the interrupt balance and acting on it.
sub goodness($);
sub imbalanced($$);
sub do_reconfig($);

# Private helpers of the functions above.
sub goodness_cpu($$);
sub move_intr($$$$);
sub ivecs_to_string(@);
sub do_find_goal($$$$);
sub find_goal($$);
sub do_reconfig_cpu2cpu($$$$);
sub do_reconfig_cpu($$$);
109bd335c64Sesolom
110bd335c64Sesolom
111bd335c64Sesolom#
# What follow are the basic data-structure routines of intrd.
113bd335c64Sesolom#
114bd335c64Sesolom# getstat() is responsible for reading the kstats and generating a "stat" hash.
115bd335c64Sesolom#
116bd335c64Sesolom# generate_delta() is responsible for taking two "stat" hashes and creating
117bd335c64Sesolom# a new "delta" hash that represents what has changed over time.
118bd335c64Sesolom#
119bd335c64Sesolom# compress_deltas() is responsible for taking a list of deltas and generating
120bd335c64Sesolom# a single delta hash that encompasses all the time periods described by the
121bd335c64Sesolom# deltas.
122bd335c64Sesolom
123bd335c64Sesolom
124bd335c64Sesolom#
125bd335c64Sesolom# getstat() is handed a reference to a kstat and generates a hash, returned
126bd335c64Sesolom# by reference, containing all the fields from the kstats which we need.
127bd335c64Sesolom# If it returns the scalar 0, it failed to gather the kstats, and the caller
128bd335c64Sesolom# should react accordingly.
129bd335c64Sesolom#
130bd335c64Sesolom# getstat() is also responsible for maintaining a reasonable $sleeptime.
131bd335c64Sesolom#
132bd335c64Sesolom# {"snaptime"}          kstat's snaptime
133bd335c64Sesolom# {<cpuid>}             one hash reference per online cpu
134bd335c64Sesolom#  ->{"tot"}            == cpu:<cpuid>:sys:cpu_nsec_{user + kernel + idle}
135bd335c64Sesolom#  ->{"crtime"}         == cpu:<cpuid>:sys:crtime
136bd335c64Sesolom#  ->{"ivecs"}
137d89fccd8Sschwartz#     ->{<cookie#>}     iterates over pci_intrs::<nexus>:cookie
138d89fccd8Sschwartz#        ->{"time"}     == pci_intrs:<ivec#>:<nexus>:time (in nsec)
139d89fccd8Sschwartz#        ->{"pil"}      == pci_intrs:<ivec#>:<nexus>:pil
140d89fccd8Sschwartz#        ->{"crtime"}   == pci_intrs:<ivec#>:<nexus>:crtime
141d89fccd8Sschwartz#        ->{"ino"}      == pci_intrs:<ivec#>:<nexus>:ino
1422917a9c9Sschwartz#        ->{"num_ino"}  == num inos of single device instance sharing this entry
1432917a9c9Sschwartz#				Will be > 1 on pcplusmp X86 systems for devices
1442917a9c9Sschwartz#				with multiple MSI interrupts.
145d89fccd8Sschwartz#        ->{"buspath"}  == pci_intrs:<ivec#>:<nexus>:buspath
146d89fccd8Sschwartz#        ->{"name"}     == pci_intrs:<ivec#>:<nexus>:name
147d89fccd8Sschwartz#        ->{"ihs"}      == pci_intrs:<ivec#>:<nexus>:ihs
148bd335c64Sesolom#
149bd335c64Sesolom
sub getstat($$)
{
	# $ks		reference to the kstat hash to read from
	# $pcplusmp_sys	true if MSI vectors of a device must be treated as
	#		one group (X86 pcplusmp systems)
	#
	# Returns a reference to the "stat" hash, or the scalar 0 if no
	# usable stat could be produced.  May change $sleeptime.
	my ($ks, $pcplusmp_sys) = @_;

	my $cpucnt = 0;
	my %stat = ();

	# $msidevs{<device name>}{<ino>} is the ivec entry (the very hash
	# stored in %stat) that was created for that MSI vector.
	my %msidevs = ();

	# kstats are not generated atomically. Each kstat hierarchy will
	# have been generated within the kernel at a different time. On a
	# thrashing system, we may not run quickly enough in order to get
	# coherent kstat timing information across all the kstats. To
	# determine if this is occurring, $minsnap/$maxsnap are used to
	# find the breadth between the first and last snaptime of all the
	# kstats we access. $maxsnap - $minsnap roughly represents the
	# total time taken up in getstat(). If this time approaches the
	# time between snapshots, our results may not be useful.

	my $minsnap = -1;	# snaptime is always a positive number
	my $maxsnap = $minsnap;

	# Iterate over the cpus in cpu:<cpuid>::. Check
	# cpu_info:<cpuid>:cpu_info<cpuid>:state to make sure the
	# processor is "on-line". If not, it isn't accepting interrupts
	# and doesn't concern us.
	#
	# Record cpu:<cpuid>:sys:snaptime, and check $minsnap/$maxsnap.

	while (my ($cpu, $cpst) = each %{$ks->{cpu}}) {
		next if !exists($ks->{cpu_info}{$cpu}{"cpu_info$cpu"}{state});

		# The "state" field of kstat cpu_info:<cpuid>:cpu_info<cpuid>.
		# The match requires the NUL that follows the state string.
		my $state = $ks->{cpu_info}{$cpu}{"cpu_info$cpu"}{state};
		next if ($state !~ /^on-line\0/);
		my $cpu_sys = $cpst->{sys};

		$stat{$cpu}{tot} = ($cpu_sys->{cpu_nsec_idle} +
				    $cpu_sys->{cpu_nsec_user} +
				    $cpu_sys->{cpu_nsec_kernel});
		$stat{$cpu}{crtime} = $cpu_sys->{crtime};
		$stat{$cpu}{ivecs} = {};

		if ($minsnap == -1 || $cpu_sys->{snaptime} < $minsnap) {
			$minsnap = $cpu_sys->{snaptime};
		}
		if ($cpu_sys->{snaptime} > $maxsnap) {
			$maxsnap = $cpu_sys->{snaptime};
		}
		$cpucnt++;
	}

	if ($cpucnt <= 1) {
		$sleeptime = $onecpu_sleeptime;
		return (0);	# nothing to do with 1 CPU
	}

	# Iterate over the ivecs. If the cpu is not on-line, ignore the
	# ivecs mapped to it, if any.
	#
	# Record pci_intrs:{inum}:<nexus>:time, snaptime, crtime, pil,
	# ino, name, and buspath. Check $minsnap/$maxsnap.

	foreach my $inst (values(%{$ks->{pci_intrs}})) {
		my $intrcfg = (values(%$inst))[0];
		my $cpu = $intrcfg->{cpu};

		next unless exists $stat{$cpu};
		next if ($intrcfg->{type} =~ /^disabled\0/);

		# Perl looks beyond NULL chars in pattern matching.
		# Truncate name field at the first NULL
		$intrcfg->{name} =~ s/\0.*$//;

		if ($intrcfg->{snaptime} < $minsnap) {
			$minsnap = $intrcfg->{snaptime};
		} elsif ($intrcfg->{snaptime} > $maxsnap) {
			$maxsnap = $intrcfg->{snaptime};
		}

		my $cookie = "$intrcfg->{buspath} $intrcfg->{ino}";
		if (exists $stat{$cpu}{ivecs}{$cookie}) {
			# Another handler sharing an already-seen cookie:
			# fold it into the existing entry.
			my $cookiestats = $stat{$cpu}{ivecs}{$cookie};

			$cookiestats->{time} += $intrcfg->{time};
			$cookiestats->{name} .= "/$intrcfg->{name}";

			# If this new interrupt sharing $cookie represents a
			# change from an earlier getstat, make sure that
			# generate_delta will see the change by setting
			# crtime to the most recent crtime of its components.

			if ($intrcfg->{crtime} > $cookiestats->{crtime}) {
				$cookiestats->{crtime} = $intrcfg->{crtime};
			}
			$cookiestats->{ihs}++;
			next;
		}

		my %ivec = (
			time	=> $intrcfg->{time},
			crtime	=> $intrcfg->{crtime},
			pil	=> $intrcfg->{pil},
			ino	=> $intrcfg->{ino},
			num_ino	=> 1,
			buspath	=> $intrcfg->{buspath},
			name	=> $intrcfg->{name},
			ihs	=> 1,
		);
		$stat{$cpu}{ivecs}{$cookie} = \%ivec;

		if ($pcplusmp_sys && ($intrcfg->{type} =~ /^msi\0/)) {
			$msidevs{$intrcfg->{name}}{$intrcfg->{ino}} = \%ivec;
		}
	}

	# All MSI interrupts of a device instance share a single MSI address.
	# On X86 systems with an APIC, this MSI address is interpreted as CPU
	# routing info by the APIC.  For this reason, on these platforms, all
	# interrupts for MSI devices must be moved to the same CPU at the same
	# time.
	#
	# Since all interrupts will be on the same CPU on these platforms, all
	# interrupts can be consolidated into one ivec entry.  For such devices,
	# num_ino will be > 1 to denote that a group move is needed.

	# Loop thru all MSI devices on X86 pcplusmp systems.
	# Nop on other systems.
	foreach my $msidevkey (sort keys %msidevs) {

		# Loop thru inos of the device, lowest value first.  The
		# entry for the lowest ino becomes the group's entry: it
		# accumulates num_ino, time and the latest crtime of all
		# the others.
		#
		# The sort must be numeric: a plain sort compares strings
		# and would put ino 10 ahead of ino 9, selecting the wrong
		# entry as the lowest one.

		# Assumes PIL is the same for first and current cookies

		my $first_ivec;
		foreach my $inokey (sort { $a <=> $b }
		    keys %{$msidevs{$msidevkey}}) {
			my $curr_ivec = $msidevs{$msidevkey}{$inokey};

			if (!defined($first_ivec)) {
				$first_ivec = $curr_ivec;
				next;
			}

			$first_ivec->{num_ino}++;
			$first_ivec->{time} += $curr_ivec->{time};
			if ($curr_ivec->{crtime} > $first_ivec->{crtime}) {
				$first_ivec->{crtime} = $curr_ivec->{crtime};
			}

			# Invalidate this cookie, less complicated and
			# more efficient than deleting it.
			$curr_ivec->{num_ino} = 0;
		}
	}

	# We define the timerange as the amount of time spent gathering the
	# various kstats, divided by our sleeptime. If we take a lot of time
	# to access the kstats, and then we create a delta comparing these
	# kstats with a prior set of kstats, that delta will cover
	# substantially different amounts of time depending upon which
	# interrupt or CPU is being examined.
	#
	# By checking the timerange here, we guarantee that any deltas
	# created from these kstats will contain self-consistent data,
	# in that all CPUs and interrupts cover a similar span of time.
	#
	# $timerange_toohi is the upper bound. Any timerange above
	# this is thrown out as garbage. If the stat is safely within this
	# bound, we treat the stat as representing an instant in time, rather
	# than the time range it actually spans. We arbitrarily choose minsnap
	# as the snaptime of the stat.

	$stat{snaptime} = $minsnap;
	my $timerange = ($maxsnap - $minsnap) / $sleeptime;
	return (0) if ($timerange > $timerange_toohi);	# i.e. failure
	return (\%stat);
}
334bd335c64Sesolom
335bd335c64Sesolom#
336bd335c64Sesolom# dumpdelta takes a reference to our "delta" structure:
337bd335c64Sesolom# {"missing"}           "1" if the delta's component stats had inconsistencies
338bd335c64Sesolom# {"minsnap"}           time of the first kstat snaptime used in this delta
339bd335c64Sesolom# {"maxsnap"}           time of the last kstat snaptime used in this delta
340bd335c64Sesolom# {"goodness"}          cost function applied to this delta
341bd335c64Sesolom# {"avgintrload"}       avg of interrupt load across cpus, as a percentage
342bd335c64Sesolom# {"avgintrnsec"}       avg number of nsec spent in interrupts, per cpu
343bd335c64Sesolom# {<cpuid>}             iterates over on-line cpus
344bd335c64Sesolom#  ->{"intrs"}          cpu's movable intr time (sum of "time" for each ivec)
3459e59f930Sesolom#  ->{"tot"}            CPU load from all sources in nsec
346bd335c64Sesolom#  ->{"bigintr"}        largest value of {ivecs}{<ivec#>}{time} from below
347bd335c64Sesolom#  ->{"intrload"}       intrs / tot
348bd335c64Sesolom#  ->{"ivecs"}
349bd335c64Sesolom#     ->{<ivec#>}       iterates over ivecs for this cpu
350bd335c64Sesolom#        ->{"time"}     time used by this interrupt (in nsec)
351bd335c64Sesolom#        ->{"pil"}      pil level of this interrupt
3522917a9c9Sschwartz#        ->{"ino"}      interrupt number (or base vector if MSI group)
353bd335c64Sesolom#        ->{"buspath"}  filename of the directory of the device's bus
354bd335c64Sesolom#        ->{"name"}     device name
355bd335c64Sesolom#        ->{"ihs"}      number of different handlers sharing this ino
3562917a9c9Sschwartz#        ->{"num_ino"}  number of interrupt vectors in MSI group
357bd335c64Sesolom#
358bd335c64Sesolom# It prints out the delta structure in a nice, human readable display.
359bd335c64Sesolom#
360bd335c64Sesolom
sub dumpdelta($)
{
	# $delta is a reference to a "delta" hash.  Everything is written
	# through syslog() at the 'debug' level; nothing is returned.
	my ($delta) = @_;

	# Delta-wide summary first.

	syslog('debug', "dumpdelta:");
	if ($delta->{missing} > 0) {
		syslog('debug', " RECONFIGURATION IN DELTA");
	}
	syslog('debug', " avgintrload: %5.2f%%  avgintrnsec: %d",
	    $delta->{avgintrload} * 100, $delta->{avgintrnsec});
	if (exists($delta->{goodness})) {
		syslog('debug', "    goodness: %5.2f%%",
		    $delta->{goodness} * 100);
	}

	# Then one section per cpu.  The per-cpu entries are the only
	# values in the delta that are references.

	while (my ($cpuid, $cpuinfo) = each %$delta) {
		next unless ref($cpuinfo);

		my $cputot = $cpuinfo->{tot};
		my $loadpct = $cpuinfo->{intrload} * 100;
		my $bigpct = $cpuinfo->{bigintr} * 100 / $cputot;

		syslog('debug', "    cpu %3d intr %7.3f%%  (bigintr %7.3f%%)",
		    $cpuid, $loadpct, $bigpct);
		syslog('debug', "        intrs %d, bigintr %d",
		    $cpuinfo->{intrs}, $cpuinfo->{bigintr});

		# And one line per ivec on that cpu; a shared ivec shows
		# its handler count in parentheses after the name.

		while (my ($cookie, $iv) = each %{$cpuinfo->{ivecs}}) {
			my $label = $iv->{name};
			if ($iv->{ihs} > 1) {
				$label = "$iv->{name}($iv->{ihs})";
			}
			syslog('debug', "    %15s:\"%s\": %7.3f%%  %d",
			    $label, $cookie,
			    $iv->{time} * 100 / $cputot, $iv->{time});
		}
	}
}
394bd335c64Sesolom
395bd335c64Sesolom#
396bd335c64Sesolom# generate_delta($stat, $newstat) takes two stat references, returned from
397bd335c64Sesolom# getstat(), and creates a %delta. %delta (not surprisingly) contains the
398bd335c64Sesolom# same basic info as stat and newstat, but with the timestamps as deltas
399bd335c64Sesolom# instead of absolute times. We return a reference to the delta.
400bd335c64Sesolom#
401bd335c64Sesolom
sub generate_delta($$)
{
	# $stat		the older stat hash, as returned by getstat()
	# $newstat	the newer stat hash, as returned by getstat()
	#
	# Returns a reference to a new "delta" hash.  If the two stats are
	# not comparable, {missing} is set and the delta is returned as
	# far as it had been built.
	my ($stat, $newstat) = @_;

	my %delta = ();

	# Running totals over all cpus; start from 0 so that they are
	# defined even if no cpu is visited.
	my $intrload = 0;
	my $intrnsec = 0;
	my $cpus = 0;

	# Take the worstcase timerange
	$delta{minsnap} = $stat->{snaptime};
	$delta{maxsnap} = $newstat->{snaptime};
	if (VERIFY($delta{maxsnap} > $delta{minsnap},
	    "generate_delta: stats aren't ascending")) {
		$delta{missing} = 1;
		return (\%delta);
	}

	# if there are a different number of cpus in the stats, set missing

	$delta{missing} = (keys(%$stat) != keys(%$newstat));
	if (VERIFY($delta{missing} == 0,
	    "generate_delta: number of CPUs changed")) {
		return (\%delta);
	}

	# scan through every cpu in %newstat and compare against %stat
	#
	# The loops below run over keys() rather than each(): we return
	# from inside them on any inconsistency, and leaving an each()
	# loop early would leave the hash's iterator part-way through, to
	# be picked up by whoever calls each() on the same hash next.

	foreach my $cpu (keys(%$newstat)) {
		my $newcpst = $newstat->{$cpu};
		next if !ref($newcpst);		# skip non-cpuid fields

		# If %stat is missing a cpu from %newstat, then it was just
		# onlined. Mark missing.

		if (VERIFY(exists $stat->{$cpu} &&
		    $stat->{$cpu}{crtime} == $newcpst->{crtime},
		    "generate_delta: cpu $cpu changed")) {
			$delta{missing} = 1;
			return (\%delta);
		}
		my $cpst = $stat->{$cpu};
		$delta{$cpu}{tot} = $newcpst->{tot} - $cpst->{tot};
		if (VERIFY($delta{$cpu}{tot} >= 0,
		    "generate_delta: deltas are not ascending?")) {
			$delta{missing} = 1;
			delete($delta{$cpu});
			return (\%delta);
		}
		# Avoid remote chance of division by zero
		$delta{$cpu}{tot} = 1 if $delta{$cpu}{tot} == 0;
		$delta{$cpu}{intrs} = 0;
		$delta{$cpu}{bigintr} = 0;

		my %ivecs = ();
		$delta{$cpu}{ivecs} = \%ivecs;

		# if the number of ivecs differs, set missing

		if (VERIFY(keys(%{$cpst->{ivecs}}) ==
			   keys(%{$newcpst->{ivecs}}),
			   "generate_delta: cpu $cpu has more/less".
			   " interrupts")) {
			$delta{missing} = 1;
			return (\%delta);
		}

		foreach my $inum (keys(%{$newcpst->{ivecs}})) {
			my $newivec = $newcpst->{ivecs}{$inum};

			# Unused cookie, corresponding to an MSI vector which
			# is part of a group.  The whole group is accounted for
			# by a different cookie.
			next if ($newivec->{num_ino} == 0);

			# If this ivec doesn't exist in $stat, or if $stat
			# shows a different crtime, set missing.
			if (VERIFY(exists $cpst->{ivecs}{$inum} &&
				   $cpst->{ivecs}{$inum}{crtime} ==
				   $newivec->{crtime},
				   "generate_delta: cpu $cpu inum $inum".
				   " has changed")) {
				$delta{missing} = 1;
				return (\%delta);
			}
			my $ivec = $cpst->{ivecs}{$inum};

			# Create $delta{$cpu}{ivecs}{$inum}.

			my %dltivec = ();
			$delta{$cpu}{ivecs}{$inum} = \%dltivec;

			# calculate time used by this interrupt

			my $time = $newivec->{time} - $ivec->{time};
			if (VERIFY($time >= 0,
				   "generate_delta: ivec went backwards?")) {
				$delta{missing} = 1;
				delete($delta{$cpu}{ivecs}{$inum});
				return (\%delta);
			}
			$delta{$cpu}{intrs} += $time;
			$dltivec{time} = $time;
			if ($time > $delta{$cpu}{bigintr}) {
				$delta{$cpu}{bigintr} = $time;
			}

			# Transfer over basic info about the kstat. We
			# don't have to worry about discrepancies between
			# ivec and newivec because we verified that both
			# have the same crtime.

			$dltivec{pil} = $newivec->{pil};
			$dltivec{ino} = $newivec->{ino};
			$dltivec{buspath} = $newivec->{buspath};
			$dltivec{name} = $newivec->{name};
			$dltivec{ihs} = $newivec->{ihs};
			$dltivec{num_ino} = $newivec->{num_ino};
		}
		if ($delta{$cpu}{tot} < $delta{$cpu}{intrs}) {
			# Ewww! Hopefully just a rounding error.
			# Make something up.
			$delta{$cpu}{tot} = $delta{$cpu}{intrs};
		}
		$delta{$cpu}{intrload} =
		       $delta{$cpu}{intrs} / $delta{$cpu}{tot};
		$intrload += $delta{$cpu}{intrload};
		$intrnsec += $delta{$cpu}{intrs};
		$cpus++;
	}

	# avgintrload is the mean of the per-cpu loads; avgintrnsec is the
	# mean interrupt time per cpu.
	if ($cpus > 0) {
		$delta{avgintrload} = $intrload / $cpus;
		$delta{avgintrnsec} = $intrnsec / $cpus;
	} else {
		$delta{avgintrload} = 0;
		$delta{avgintrnsec} = 0;
	}
	return (\%delta);
}
539bd335c64Sesolom
540bd335c64Sesolom
# compress_deltas() takes a list of deltas, and returns a single new delta
542bd335c64Sesolom# which represents the combined information from all the deltas. The deltas
543bd335c64Sesolom# provided are assumed to be sequential in time. The resulting compressed
544bd335c64Sesolom# delta looks just like any other delta. This new delta is also more accurate
545bd335c64Sesolom# since its statistics are averaged over a longer period than any of the
546bd335c64Sesolom# original deltas.
547bd335c64Sesolom
sub compress_deltas ($)
{
	# $deltas is a reference to a list of delta hashes, oldest first.
	#
	# Returns a reference to the combined delta, or the scalar 0 if
	# the list is empty or contains a delta marked {missing}.  As a
	# side effect $sleeptime is set to $idle_sleeptime when every cpu's
	# interrupt load is below $idle_intrload, else to $normal_sleeptime.
	my ($deltas) = @_;

	my %newdelta = ();
	my $intrs = 0;		# interrupt time summed over all cpus
	my $tot = 0;		# total time summed over all cpus
	my $cpus = 0;
	my $high_intrload = 0;	# highest per-cpu interrupt load seen

	if (VERIFY($#$deltas != -1,
		   "compress_deltas: list of delta is empty?")) {
		return (0);
	}
	$newdelta{minsnap} = $deltas->[0]{minsnap};
	$newdelta{maxsnap} = $deltas->[$#$deltas]{maxsnap};
	$newdelta{missing} = 0;

	# Sum up the per-cpu and per-ivec times of all the deltas.  The
	# descriptive ivec fields are simply taken from the latest delta.

	foreach my $delta (@$deltas) {
		if (VERIFY($delta->{missing} == 0,
		    "compressing bad deltas?")) {
			return (0);
		}
		foreach my $cpuid (keys(%$delta)) {
			my $cpu = $delta->{$cpuid};
			next if !ref($cpu);	# ignore non-cpu fields

			$intrs += $cpu->{intrs};
			$tot += $cpu->{tot};
			$newdelta{$cpuid}{intrs} += $cpu->{intrs};
			$newdelta{$cpuid}{tot} += $cpu->{tot};
			if (!exists $newdelta{$cpuid}{ivecs}) {
				$newdelta{$cpuid}{ivecs} = {};
			}

			my $newivecs = $newdelta{$cpuid}{ivecs};
			foreach my $inum (keys(%{$cpu->{ivecs}})) {
				my $ivec = $cpu->{ivecs}{$inum};

				$newivecs->{$inum}{time} += $ivec->{time};
				$newivecs->{$inum}{pil} = $ivec->{pil};
				$newivecs->{$inum}{ino} = $ivec->{ino};
				$newivecs->{$inum}{buspath} = $ivec->{buspath};
				$newivecs->{$inum}{name} = $ivec->{name};
				$newivecs->{$inum}{ihs} = $ivec->{ihs};
				$newivecs->{$inum}{num_ino} = $ivec->{num_ino};
			}
		}
	}

	# Derive bigintr and intrload for every cpu of the combined delta.

	foreach my $cpu (values(%newdelta)) {
		next if !ref($cpu); # ignore non-cpu fields
		$cpus++;

		my $bigintr = 0;
		foreach my $ivec (values(%{$cpu->{ivecs}})) {
			if ($ivec->{time} > $bigintr) {
				$bigintr = $ivec->{time};
			}
		}
		$cpu->{bigintr} = $bigintr;

		# Clamp tot BEFORE it is used as a divisor; doing so only
		# afterwards cannot prevent a division by zero.
		$cpu->{tot} = 1 if $cpu->{tot} <= 0;
		$cpu->{intrload} = $cpu->{intrs} / $cpu->{tot};
		if ($high_intrload < $cpu->{intrload}) {
			$high_intrload = $cpu->{intrload};
		}
	}
	if ($cpus == 0) {
		$newdelta{avgintrnsec} = 0;
		$newdelta{avgintrload} = 0;
	} else {
		$newdelta{avgintrnsec} = $intrs / $cpus;
		$newdelta{avgintrload} = ($tot > 0) ? $intrs / $tot : 0;
	}
	$sleeptime = ($high_intrload < $idle_intrload) ? $idle_sleeptime :
	    $normal_sleeptime;
	return (\%newdelta);
}
621bd335c64Sesolom
622bd335c64Sesolom
623bd335c64Sesolom
624bd335c64Sesolom
625bd335c64Sesolom
626bd335c64Sesolom# What follow are the core functions responsible for examining the deltas
627bd335c64Sesolom# generated above and deciding what to do about them.
628bd335c64Sesolom#
# goodness() and its helper goodness_cpu() return a heuristic which describes
630bd335c64Sesolom# how good (or bad) the current interrupt balance is. The value returned will
631bd335c64Sesolom# be between 0 and 1, with 0 representing maximum goodness, and 1 representing
632bd335c64Sesolom# maximum badness.
633bd335c64Sesolom#
634bd335c64Sesolom# imbalanced() compares a current and historical value of goodness, and
635bd335c64Sesolom# determines if there has been enough change to warrant evaluating a
636bd335c64Sesolom# reconfiguration of the interrupts
637bd335c64Sesolom#
638bd335c64Sesolom# do_reconfig(), and its helpers, do_reconfig_cpu(), do_reconfig_cpu2cpu(),
639bd335c64Sesolom# find_goal(), do_find_goal(), and move_intr(), are responsible for examining
640bd335c64Sesolom# a delta and determining the best possible assignment of interrupts to CPUs.
641bd335c64Sesolom#
642bd335c64Sesolom# It is important that do_reconfig() be in alignment with goodness(). If
643bd335c64Sesolom# do_reconfig were to generate a new interrupt distribution that worsened
644bd335c64Sesolom# goodness, we could get into a pathological loop with intrd fighting itself,
645bd335c64Sesolom# constantly deciding that things are imbalanced, and then changing things
646bd335c64Sesolom# only to make them worse.
647bd335c64Sesolom
648bd335c64Sesolom
649bd335c64Sesolom
# Tunables shared by goodness_cpu(), imbalanced() and the do_reconfig*()
# routines:
#
#   $goodness_unsafe_load
#	A CPU whose interrupt load is above this value, and which has more
#	than one interrupt source, is given the worst possible goodness.
#
#   $goodness_mindelta
#	The smallest change in goodness which is considered significant;
#	smaller changes do not justify a reconfiguration.

my $goodness_unsafe_load = 0.9;
my $goodness_mindelta = 0.1;
655bd335c64Sesolom
656bd335c64Sesolom# goodness(%delta) examines a delta and returns its "goodness". goodness will
657bd335c64Sesolom# be between 0 (best) and 1 (major bad). goodness is determined by evaluating
658bd335c64Sesolom# the goodness of each individual cpu, and returning the worst case. This
659bd335c64Sesolom# helps on systems with many CPUs, where otherwise a single pathological CPU
660bd335c64Sesolom# might be ignored because the average was OK.
661bd335c64Sesolom#
662bd335c64Sesolom# To calculate the goodness of an individual CPU, we start by looking at its
663bd335c64Sesolom# load due to interrupts. If the load is above a certain high threshold and
664bd335c64Sesolom# there is more than one interrupt assigned to this CPU, we set goodness
665bd335c64Sesolom# to worst-case. If the load is below the average interrupt load of all CPUs,
666bd335c64Sesolom# then we return best-case, since what's to complain about?
667bd335c64Sesolom#
668bd335c64Sesolom# Otherwise we look at how much the load is above the average, and return
669bd335c64Sesolom# that as the goodness, with one caveat: we never return more than the CPU's
670bd335c64Sesolom# interrupt load ignoring its largest single interrupt source. This is
671bd335c64Sesolom# because a CPU with one high-load interrupt, and no other interrupts, is
672bd335c64Sesolom# perfectly balanced. Nothing can be done to improve the situation, and thus
673bd335c64Sesolom# it is perfectly balanced even if the interrupt's load is 100%.
674bd335c64Sesolom
# goodness(\%delta)
#
# Returns the worst (largest) per-CPU goodness found in the delta, a value
# between 0 (best) and 1 (worst). Returns 1 immediately if the delta has
# {missing} set, if any CPU scores exactly 1, or if a CPU's score fails the
# range check. Entries of the delta which are not references are not CPUs
# and are ignored.
sub goodness($)
{
	my ($delta) = @_;

	if ($delta->{missing} > 0) {
		return (1);
	}

	my $worst = 0;

	for my $entry (values(%$delta)) {
		ref($entry) or next;	# only references describe cpus

		my $score = goodness_cpu($entry, $delta->{avgintrload});

		# VERIFY() is defined elsewhere in this file; it is used here
		# as returning true when the asserted condition did not hold.
		if (VERIFY($score >= 0 && $score <= 1,
		    "goodness: cpu goodness out of range?")) {
			dumpdelta($delta);
			return (1);
		}

		# Nothing can be worse than 1, so stop looking.
		return (1) if $score == 1;

		$worst = $score if $score > $worst;
	}
	return ($worst);
}
702bd335c64Sesolom
# goodness_cpu(\%cpu, $avgintrload)
#
# Returns the goodness of a single CPU:
#   0	if its interrupt load ({intrs} / {tot}) is below $avgintrload;
#   1	if its load is above $goodness_unsafe_load and it has more than
#	one entry in {ivecs};
#   otherwise the amount by which its load exceeds $avgintrload, capped at
#   the load which would remain if its biggest interrupt ({bigintr}) were
#   excluded.
sub goodness_cpu($$)		# private function
{
	my ($cpu, $avgintrload) = @_;

	my $intrs = $cpu->{intrs};
	my $tot = $cpu->{tot};
	my $load = $intrs / $tot;

	# A cpu below the average has nothing to complain about.
	if ($load < $avgintrload) {
		return (0);
	}

	# Load from everything except the single biggest interrupt. This
	# is the most that could be gained by offloading interrupts.
	my $rest_load = ($intrs - $cpu->{bigintr}) / $tot;

	# A saturated cpu with several interrupt sources is the worst
	# case: the other interrupts could be starved if of a lower pil.
	my $nsources = keys(%{$cpu->{ivecs}});
	return (1) if $nsources > 1 && $load > $goodness_unsafe_load;

	my $excess = $load - $avgintrload;
	return ($excess > $rest_load ? $rest_load : $excess);
}
737bd335c64Sesolom
738bd335c64Sesolom
739bd335c64Sesolom# imbalanced() is used by the main routine to determine if the goodness
740bd335c64Sesolom# has shifted far enough from our last baseline to warrant a reassignment
741bd335c64Sesolom# of interrupts. A very high goodness indicates that a CPU is way out of
742bd335c64Sesolom# whack. If the goodness has varied too much since the baseline, then
743bd335c64Sesolom# perhaps a reconfiguration is worth considering.
744bd335c64Sesolom
# imbalanced($goodness, $baseline)
#
# Returns 1 if $goodness is above .50, or if it differs from $baseline
# (in either direction) by more than $goodness_mindelta; otherwise 0.
sub imbalanced ($$)
{
	my ($goodness, $baseline) = @_;

	# Either we are pathological, or we have crept away from the
	# baseline. The baseline is only examined if the first test fails.
	if ($goodness > .50 ||
	    abs($goodness - $baseline) > $goodness_mindelta) {
		return (1);
	}
	return (0);
}
755bd335c64Sesolom
756bd335c64Sesolom# do_reconfig(), do_reconfig_cpu(), and do_reconfig_cpu2cpu(), are the
757bd335c64Sesolom# decision-making functions responsible for generating a new interrupt
758bd335c64Sesolom# distribution. They are designed with the definition of goodness() in
759bd335c64Sesolom# mind, i.e. they use the same definition of "good distribution" as does
760bd335c64Sesolom# goodness().
761bd335c64Sesolom#
762bd335c64Sesolom# do_reconfig() is responsible for deciding whether a redistribution is
763bd335c64Sesolom# actually warranted. If the goodness is already pretty good, it doesn't
764bd335c64Sesolom# waste the CPU time to generate a new distribution. If it
765bd335c64Sesolom# calculates a new distribution and finds that it is not sufficiently
766bd335c64Sesolom# improved from the prior distribution, it will not do the redistribution,
767bd335c64Sesolom# mainly to avoid the disruption to system performance caused by
768bd335c64Sesolom# rejuggling interrupts.
769bd335c64Sesolom#
770bd335c64Sesolom# Its main loop works by going through a list of cpus sorted from
771bd335c64Sesolom# highest to lowest interrupt load. It removes the highest-load cpus
772bd335c64Sesolom# one at a time and hands them off to do_reconfig_cpu(). This function
773bd335c64Sesolom# then re-sorts the remaining CPUs from lowest to highest interrupt load,
774bd335c64Sesolom# and one at a time attempts to rejuggle interrupts between the original
775bd335c64Sesolom# high-load CPU and the low-load CPU. Rejuggling on a high-load CPU is
776bd335c64Sesolom# considered finished as soon as its interrupt load is within
777bd335c64Sesolom# $goodness_mindelta of the average interrupt load. Such a CPU will have
778bd335c64Sesolom# a goodness of below the $goodness_mindelta threshold.
779bd335c64Sesolom
#
# move_intr(\%delta, $inum, $oldcpuid, $newcpuid)
#
# Bookkeeping helper for the reconfiguration code: re-homes interrupt $inum
# from cpu $oldcpuid to cpu $newcpuid inside the delta. Both cpus have
# their {intrs}, {intrload}, {ivecs} and {bigintr} fields brought up to
# date, and the ivec's {nowcpu} is set to $newcpuid. Only the data
# structures are changed here; nothing is moved on the running system.
#
sub move_intr($$$$)		# private function
{
	my ($delta, $inum, $oldcpuid, $newcpuid) = @_;

	my $moving = $delta->{$oldcpuid}{ivecs}{$inum};
	my $nsec = $moving->{time};

	# Detach the ivec from the cpu it is leaving.

	my $src = $delta->{$oldcpuid};
	$src->{intrs} -= $nsec;
	$src->{intrload} = $src->{intrs} / $src->{tot};
	delete($src->{ivecs}{$inum});

	VERIFY($src->{intrs} >= 0, "move_intr: intr's time > total time?");
	VERIFY($nsec <= $src->{bigintr},
	    "move_intr: intr's time > bigintr?");

	# If the departing ivec was the biggest on that cpu, find the new
	# biggest from among those left behind.

	if ($nsec >= $src->{bigintr}) {
		my $largest = 0;

		for my $left (values(%{$src->{ivecs}})) {
			if ($left->{time} > $largest) {
				$largest = $left->{time};
			}
		}
		$src->{bigintr} = $largest;
	}

	# Attach the ivec to the cpu it is joining.

	my $dst = $delta->{$newcpuid};

	$moving->{nowcpu} = $newcpuid;
	$dst->{intrs} += $nsec;
	$dst->{intrload} = $dst->{intrs} / $dst->{tot};
	$dst->{ivecs}{$inum} = $moving;

	if ($nsec > $dst->{bigintr}) {
		$dst->{bigintr} = $nsec;
	}
}
824bd335c64Sesolom
# move_intr_check(\%delta, $oldcpuid, $newcpuid)
#
# Sanity check run after interrupts have been shuffled between two cpus:
# asserts, via VERIFY(), that neither cpu's {intrs} exceeds its {tot}.
sub move_intr_check($$$)	# private function
{
	my ($delta, $oldcpuid, $newcpuid) = @_;

	my $src = $delta->{$oldcpuid};
	VERIFY($src->{tot} >= $src->{intrs},
	    "Moved interrupts left 100+%% load on src cpu");

	my $tgt = $delta->{$newcpuid};
	VERIFY($tgt->{tot} >= $tgt->{intrs},
	    "Moved interrupts left 100+%% load on tgt cpu");
}
834bd335c64Sesolom
# ivecs_to_string(@ivecs)
#
# Returns the {inum} of every ivec passed in, each one preceded by a single
# space (so a non-empty result starts with a space). An empty argument
# list gives the empty string.
sub ivecs_to_string(@)		# private function
{
	return (join("", map({ " $_->{inum}" } @_)));
}
843bd335c64Sesolom
844bd335c64Sesolom
# do_reconfig(\%delta)
#
# Works out a better assignment of interrupts to cpus for the delta and,
# if the gain in goodness is worth it, asks intrmove() to apply it.
#
# Returns:
#    0	nothing was moved (goodness already good enough, or the new
#	distribution was not a sufficient improvement);
#    1	every interrupt which needed to move was moved;
#   -1	the delta had {missing} set, or at least one intrmove() failed.
sub do_reconfig($)
{
	my ($delta) = @_;

	my $goodness = $delta->{goodness};

	# Goodness can't get better than 0. If even reaching 0 would be
	# too small an improvement to merit the action, stop here.

	if ($goodness < $goodness_mindelta) {
		syslog('debug', "goodness good enough, don't reconfig");
		return (0);
	}

	syslog('notice', "Optimizing interrupt assignments");

	if (VERIFY($delta->{missing} == 0,
	    "RECONFIG Aborted: should not have a delta with missing")) {
		return (-1);
	}

	# Collect every cpuid, and tag each ivec with the cpu it started
	# on, the cpu it is on now, and its own interrupt number.

	my @cpusortlist;

	foreach my $cpuid (keys(%$delta)) {
		my $cpu = $delta->{$cpuid};
		ref($cpu) or next;	# skip non-cpu entries

		push(@cpusortlist, $cpuid);
		foreach my $inum (keys(%{$cpu->{ivecs}})) {
			my $ivec = $cpu->{ivecs}{$inum};
			@$ivec{qw(origcpu nowcpu inum)} =
			    ($cpuid, $cpuid, $inum);
		}
	}

	# Repeatedly take the cpu with the highest interrupt load off the
	# list and try to redistribute its interrupts. The list is sorted
	# afresh on every pass because do_reconfig_cpu() can move
	# interrupts around. Once the busiest cpu is neither over the
	# unsafe load nor more than the threshold above the average, no
	# cpu left is worth reconfiguring. A cpu whose own goodness is
	# already under the threshold is simply passed over.

	while (@cpusortlist) {
		@cpusortlist =
		    sort { $delta->{$b}{intrload} <=> $delta->{$a}{intrload} }
		    @cpusortlist;

		my $busiest = shift(@cpusortlist);
		my $busyload = $delta->{$busiest}{intrload};
		my $limit = $delta->{avgintrload} + $goodness_mindelta;

		if ($busyload <= $goodness_unsafe_load &&
		    $busyload <= $limit) {
			syslog('debug', "finished reconfig: cpu $busiest load ".
			    "$delta->{$busiest}{intrload} avgload ".
			    "$delta->{avgintrload}");
			last;
		}
		if (goodness_cpu($delta->{$busiest}, $delta->{avgintrload}) >=
		    $goodness_mindelta) {
			do_reconfig_cpu($delta, \@cpusortlist, $busiest);
		}
	}

	# How good a job did we do? Unless we started out pathological
	# (goodness of 1) and managed to get away from that, a minimal
	# improvement is not worth the bother of moving interrupts.

	my $newgoodness = goodness($delta);
	VERIFY($newgoodness <= $goodness,
	    "reconfig: result has worse goodness?");

	my $escaped_worst_case = ($goodness == 1 && $newgoodness != 1);
	if (!$escaped_worst_case &&
	    $goodness - $newgoodness < $goodness_mindelta) {
		syslog('debug', "goodness already near optimum, ".
		    "don't reconfig");
		return (0);
	}
	syslog('debug', "goodness %5.2f%% --> %5.2f%%", $goodness*100,
	    $newgoodness*100);

	# Time to move those interrupts! Every ivec which is no longer on
	# the cpu it started on is handed to intrmove().

	my $ret = 1;
	my $warned = 0;
	foreach my $cpuid (keys(%$delta)) {
		next if $cpuid =~ /\D/;

		my $cpu = $delta->{$cpuid};
		foreach my $ivec (values(%{$cpu->{ivecs}})) {
			next if ($ivec->{origcpu} == $cpuid);

			next if intrmove($ivec->{buspath}, $ivec->{ino},
			    $cpuid, $ivec->{num_ino});

			# The warning is logged for the first failure only.
			if ($warned++ == 0) {
				syslog('warning', "Unable to move interrupts");
			}
			syslog('debug', "Unable to move buspath ".
			    "$ivec->{buspath} ino $ivec->{ino} to ".
			    "cpu $cpuid");
			$ret = -1;
		}
	}

	syslog('notice', "Interrupt assignments optimized");
	return ($ret);
}
955bd335c64Sesolom
# do_reconfig_cpu(\%delta, \@cpusortlist, $oldcpuid)
#
# Tries to improve the load on $oldcpuid by rejuggling interrupts between
# it and the cpus named in @cpusortlist. The caller's list is walked in
# reverse (the caller keeps it sorted from highest to lowest load) and is
# not itself modified. For each candidate, do_reconfig_cpu2cpu() is called
# to exchange interrupts. The walk ends when $oldcpuid's goodness drops
# below $goodness_mindelta, or when a candidate has a higher load than
# $oldcpuid.
sub do_reconfig_cpu($$$)	# private function
{
	my ($delta, $cpusortlist, $oldcpuid) = @_;

	syslog('debug', "reconfiguring $oldcpuid");

	my $src = $delta->{$oldcpuid};
	my $avg = $delta->{avgintrload};

	foreach my $tgtcpuid (reverse(@$cpusortlist)) {
		# Good enough already? Then we are finished with this cpu.
		last if goodness_cpu($src, $avg) < $goodness_mindelta;

		my $tgt = $delta->{$tgtcpuid};
		my $srcload = $src->{intrload};

		# Never shed load onto a cpu which is busier than we are.
		if ($tgt->{intrload} > $srcload) {
			last;
		}
		do_reconfig_cpu2cpu($delta, $oldcpuid, $tgtcpuid, $srcload);
	}
}
986bd335c64Sesolom
# do_reconfig_cpu2cpu(\%delta, $srccpuid, $tgtcpuid, $srcload)
#
# Considers juggling interrupts between $srccpuid (high interrupt load)
# and $tgtcpuid (lower interrupt load). The ivecs of both cpus are pooled,
# find_goal() picks the set which should live on $srccpuid, and
# move_intr() updates the delta to match. $srcload is the source cpu's
# load before the exchange and is only used for the final sanity check.
sub do_reconfig_cpu2cpu($$$$)	# private function
{
	my ($delta, $srccpuid, $tgtcpuid, $srcload) = @_;

	syslog('debug', "exchanging intrs between $srccpuid and $tgtcpuid");

	my $src = $delta->{$srccpuid};
	my $tgt = $delta->{$tgtcpuid};

	# Pool the ivecs of both cpus, biggest {time} first.

	my @ivecs = sort { $b->{time} <=> $a->{time} }
	    (values(%{$src->{ivecs}}), values(%{$tgt->{ivecs}}));
	return unless @ivecs;

	# The "goal" load for srccpuid is the average load across all
	# CPUs. find_goal() will determine the selection of the available
	# interrupts which comes closest to this goal without falling
	# below it.
	#
	# tgtcpuid is known to be less loaded than srccpuid, but it could
	# still be above avgintrnsec, so never aim lower than the mean of
	# the two cpus; that would push srccpuid below tgtcpuid.

	my $goal = $delta->{avgintrnsec};
	my $pairavg = ($src->{intrs} + $tgt->{intrs}) / 2;
	$goal = $pairavg if $goal < $pairavg;

	# If the biggest interrupt of all started out on srccpuid, leave
	# it there and take it out of consideration. This can help
	# minimize the disruption caused by moving interrupts.

	if ($ivecs[0]->{origcpu} == $srccpuid) {
		my $biggest = shift(@ivecs);
		syslog('debug', "Keeping $biggest->{inum} on $srccpuid");
		$goal -= $biggest->{time};
	}

	syslog('debug', "GOAL: inums should total $goal");
	find_goal(\@ivecs, $goal);

	# find_goal() reports back through $ivec->{goal}: set means the
	# ivec belongs on srccpuid, clear means tgtcpuid. Move whichever
	# ivecs are not already where they belong.

	foreach my $ivec (@ivecs) {
		syslog('debug', "ivec $ivec->{inum} goal $ivec->{goal}");
		VERIFY($ivec->{nowcpu} == $srccpuid ||
		    $ivec->{nowcpu} == $tgtcpuid, "cpu2cpu found an ".
		    "interrupt not currently on src or tgt cpu");

		my $wanted = $ivec->{goal} ? $srccpuid : $tgtcpuid;
		if ($ivec->{nowcpu} != $wanted) {
			move_intr($delta, $ivec->{inum}, $ivec->{nowcpu},
			    $wanted);
		}
	}
	move_intr_check($delta, $srccpuid, $tgtcpuid); # asserts

	my $newload = $delta->{$srccpuid}{intrs} / $delta->{$srccpuid}{tot};
	VERIFY($newload <= $srcload && $newload > $delta->{avgintrload},
	    "cpu2cpu: new load didn't end up in expected range");
}
1059bd335c64Sesolom
1060bd335c64Sesolom
1061bd335c64Sesolom# find_goal() and its helper do_find_goal() are used to find the best
1062bd335c64Sesolom# combination of interrupts in order to generate a load that is as close
1063bd335c64Sesolom# as possible to a goal load without falling below that goal. Before returning
1064bd335c64Sesolom# to its caller, find_goal() sets a new value in the hash of each interrupt,
1065bd335c64Sesolom# {goal}, which if set signifies that this interrupt is one of the interrupts
1066bd335c64Sesolom# identified as part of the set of interrupts which best meet the goal.
1067bd335c64Sesolom#
1068bd335c64Sesolom# The arguments to find_goal are a list of ivecs (hash references), sorted
1069bd335c64Sesolom# by descending {time}, and the goal load. The goal is relative to {time}.
1070bd335c64Sesolom# The best fit is determined by performing a depth-first search. do_find_goal
1071bd335c64Sesolom# is the recursive subroutine which carries out the search.
1072bd335c64Sesolom#
1073bd335c64Sesolom# It is passed an index as an argument, originally 0. On a given invocation,
1074bd335c64Sesolom# it is only to consider interrupts in the ivecs array starting at that index.
1075bd335c64Sesolom# It then considers two possibilities:
1076bd335c64Sesolom#   1) What is the best goal-fit if I include ivecs[index]?
1077bd335c64Sesolom#   2) What is the best goal-fit if I exclude ivecs[index]?
1078bd335c64Sesolom# To determine case 1, it subtracts the load of ivecs[index] from the goal,
1079bd335c64Sesolom# and calls itself recursively with that new goal and index++.
1080bd335c64Sesolom# To determine case 2, it calls itself recursively with the same goal and
1081bd335c64Sesolom# index++.
1082bd335c64Sesolom#
1083bd335c64Sesolom# It then compares the two results, decides which one best meets the goals,
1084bd335c64Sesolom# and returns the result. The return value is the best-fit's interrupt load,
1085bd335c64Sesolom# followed by a list of all the interrupts which make up that best-fit.
1086bd335c64Sesolom#
1087bd335c64Sesolom# As an optimization, a second array loads[] is created which mirrors ivecs[].
1088bd335c64Sesolom# loads[i] will equal the total loads of all ivecs[i..$#ivecs]. This is used
1089bd335c64Sesolom# by do_find_goal to avoid recursing all the way to the end of the ivecs
1090bd335c64Sesolom# array if including all remaining interrupts will still leave the best-fit
1091bd335c64Sesolom# at below goal load. If so, it then includes all remaining interrupts on
1092bd335c64Sesolom# the goal list and returns.
1093bd335c64Sesolom#
# find_goal(\@ivecs, $goal)
#
# Entry point of the best-fit search. @ivecs must be sorted by descending
# {time}. On return every ivec in the list has {goal} set to 1 if it is
# part of the chosen set, or to 0 if it is not. A $goal of zero or less
# selects nothing.
sub find_goal($$)		# private function
{
	my ($ivecs, $goal) = @_;

	my @goals;	# the chosen ivecs, in the same order as @$ivecs

	if ($goal > 0) {
		syslog('debug', "finding goal from intrs %s",
		    ivecs_to_string(@$ivecs));

		# Build @loads, where $loads[i] is the total {time} of
		# ivecs[i] and everything after it.

		my $remaining = 0;
		$remaining += $_->{time} foreach @$ivecs;

		my @loads;
		foreach my $ivec (@$ivecs) {
			push(@loads, $remaining);
			$remaining -= $ivec->{time};
		}

		my $load;
		($load, @goals) = do_find_goal($ivecs, \@loads, $goal, 0);
		VERIFY($load >= $goal, "find_goal didn't meet goals");
	}
	syslog('debug', "goals found: %s", ivecs_to_string(@goals));

	# Walk the two lists in step. @goals is consumed from the front,
	# so only its first element ever needs to be compared.

	foreach my $ivec (@$ivecs) {
		my $chosen = (@goals && $ivec == $goals[0]);

		if (!$chosen) {
			syslog('debug', "inum $ivec->{inum} on target cpu");
			$ivec->{goal} = 0;
			next;
		}
		syslog('debug', "inum $ivec->{inum} on source cpu");
		$ivec->{goal} = 1;
		shift(@goals);
	}
}
1137bd335c64Sesolom
1138bd335c64Sesolom
# do_find_goal(\@ivecs, \@loads, $goal, $idx)
#
# Recursive worker for find_goal(). Looking only at ivecs[$idx] onwards, it
# compares the best fit which includes ivecs[$idx] with the best fit which
# excludes it. The return value is a list: the total {time} of the chosen
# ivecs, followed by the chosen ivecs themselves. When $idx is past the
# end of the list the return value is just (0).
sub do_find_goal($$$$)		# private function
{
	my ($ivecs, $loads, $goal, $idx) = @_;

	my $last = $#{$ivecs};
	return (0) if $idx > $last;

	my $cur = $ivecs->[$idx];
	syslog('debug', "$idx: finding goal $goal inum $cur->{inum}");

	# If everything from here to the end of the list still doesn't
	# exceed the goal, there is nothing better to be done than to take
	# all of it, and no need to search any further.

	my $remaining = $loads->[$idx];
	if ($remaining <= $goal) {
		my @rest = @$ivecs[$idx .. $last];
		syslog('debug',
		    "$idx: including all remaining intrs %s with load %d",
		    ivecs_to_string(@rest), $remaining);
		return ($remaining, @rest);
	}

	# The "with" option: the best fit which includes this ivec. If
	# this ivec alone reaches the goal, adding more could only take us
	# further from it, so there is no need to recurse.

	my $load = $cur->{time};
	my $with = $load;
	my @goals_with;

	if ($goal > $load) {
		my $subload;
		($subload, @goals_with) =
		    do_find_goal($ivecs, $loads, $goal - $load, $idx + 1);
		$with = $subload + $load;
	}
	syslog('debug', "$idx: with-load $with intrs %s",
	    ivecs_to_string($cur, @goals_with));

	# The "without" option: the best fit which excludes this ivec.

	my ($without, @goals_without) =
	    do_find_goal($ivecs, $loads, $goal, $idx + 1);
	syslog('debug', "$idx: without-load $without intrs %s",
	    ivecs_to_string(@goals_without));

	# Pick the winner. If only one option reaches the goal, it wins.
	# If both reach the goal, the smaller one wins. If neither does,
	# the larger one wins. "with" wins all ties.

	my $with_ok = ($with >= $goal);
	my $without_ok = ($without >= $goal);

	my $use_without;
	if ($with_ok xor $without_ok) {
		$use_without = $without_ok;
	} elsif ($with_ok) {
		$use_without = ($without < $with);
	} else {
		$use_without = ($without > $with);
	}

	if ($use_without) {
		syslog('debug', "$idx: going without");
		return ($without, @goals_without);
	}
	syslog('debug', "$idx: going with");
	return ($with, $cur, @goals_with);
}
1218bd335c64Sesolom
1219bd335c64Sesolom
1220bd335c64Sesolom
1221bd335c64Sesolom
# Daemon start-up: announce ourselves, declare the state used by the main
# loop, arrange for signals to be noticed rather than acted on at once,
# and take the first kstat snapshot.

syslog('debug', "intrd is starting".($debug ? " (debug)" : ""));

my @deltas;
my $deltas_tottime = 0;		# sum of maxsnap-minsnap across @deltas
my $avggoodness;
my $baseline_goodness = 0;
my $compdelta;

my $do_reconfig;

# temp variables
my ($goodness, $deltatime, $olddelta, $olddeltatime, $delta);
my ($newstat, $below_statslen, $newtime, $ret);

# INT, HUP and TERM only set a flag, which is tested at safe points, so
# that we don't die in the middle of retargeting.
my $gotsig = 0;
my $note_signal = sub { $gotsig = 1; };
$SIG{$_} = $note_signal foreach qw(INT HUP TERM);

# myks_update() is supplied by the simulator.
my $ks = ($using_scengen == 0) ? Sun::Solaris::Kstat->new() : myks_update();

# If no pci_intrs kstats were found, we need to exit, but we can't because
# SMF will restart us and/or report an error to the administrator. But
# there's nothing an administrator can do. So print out a message for SMF
# logs and silently pause until a signal arrives.

if (!exists($ks->{pci_intrs})) {
	print STDERR "$cmdname: no interrupts were found; ".
	    "your PCI bus may not yet be supported\n";
	until ($gotsig != 0) {
		pause();
	}
	exit 0;
}

# See if this is a system with a pcplusmp APIC; such systems get special
# handling. It is assumed that either all or none of the busses on a system
# are hosted by the pcplusmp APIC, so only one buspath is queried: that of
# the first entry found under the first of the pci_intrs kstats.

my @elem = values(%{$ks->{pci_intrs}});
my $elem0 = $elem[0];
my $elemval = (values(%$elem0))[0];

my $pcplusmp_sys = is_pcplusmp($elemval->{buspath});

my $stat = getstat($ks, $pcplusmp_sys);
1282bd335c64Sesolom
#
# clear_deltas(): throw away all accumulated history.
#
# Empties @deltas, zeroes $deltas_tottime and sets $stat to 0.  A
# non-reference $stat makes the next loop iteration re-seed $stat from the
# fresh sample instead of calling generate_delta() against stale stats,
# which prevents that delta from having {missing} set.
#
# Named subs are compiled once no matter where they are written, so this
# is defined ahead of the loop rather than inside its body.
#
sub clear_deltas {
	@deltas = ();
	$deltas_tottime = 0;
	$stat = 0;
}

#
# Main program, part 2: the sampling loop.  Runs until a signal sets
# $gotsig.  Each pass takes a new statistics sample, turns it into a delta
# against the previous sample, maintains a moving window of deltas covering
# about $statslen seconds, and decides whether to call do_reconfig().
#
for (;;) {
	# 1. Sleep, update the kstats, and save the new stats in $newstat.

	exit 0 if $gotsig;		# if we got ^C / SIGTERM, exit
	if ($using_scengen == 0) {
		sleep($sleeptime);
		exit 0 if $gotsig;	# if we got ^C / SIGTERM, exit
		$ks->update();
	} else {
		$ks = myks_update();
	}
	$newstat = getstat($ks, $pcplusmp_sys);

	# $stat or $newstat could be zero if they're uninitialized, or if
	# getstat() failed. If $stat is zero, move $newstat to $stat, sleep
	# and try again. If $newstat is zero, then we also sleep and try
	# again, hoping the problem will clear up.

	next if (!ref $newstat);
	if (!ref $stat) {
		$stat = $newstat;
		next;
	}

	# 2. Compare $newstat with the prior set of values, result in %$delta.

	$delta = generate_delta($stat, $newstat);
	dumpdelta($delta) if $debug;	# Dump most recent stats to stdout.
	$stat = $newstat;	# The new stats now become the old stats.


	# 3. If $delta->{missing}, then there has been a reconfiguration of
	# either cpus or interrupts (probably both). We need to toss out our
	# old set of statistics and start from scratch.
	#
	# Also, if the delta covers a very long range of time, then we've
	# been experiencing a system overload that has resulted in intrd
	# not being allowed to run effectively for a while now. As above,
	# toss our old statistics and start from scratch.

	$deltatime = $delta->{maxsnap} - $delta->{minsnap};
	if ($delta->{missing} > 0 || $deltatime > $statslen) {
		clear_deltas();
		syslog('debug', "evaluating interrupt assignments");
		next;
	}


	# 4. Incorporate new delta into the list of deltas, and associated
	# statistics. If this delta is the one that takes the accumulated
	# time from below $statslen to at least $statslen, then it's time
	# to evaluate a reconfiguration.

	$below_statslen = ($deltas_tottime < $statslen);
	$deltas_tottime += $deltatime;
	$do_reconfig = ($below_statslen && $deltas_tottime >= $statslen);
	push(@deltas, $delta);

	# 5. Remove old deltas if total time is more than $statslen. We use
	# @deltas as a moving average of the last $statslen seconds. Shift
	# off the oldest deltas, but only if that doesn't cause us to fall
	# below $statslen seconds.

	while (@deltas > 1) {
		$olddelta = $deltas[0];
		$olddeltatime = $olddelta->{maxsnap} - $olddelta->{minsnap};
		$newtime = $deltas_tottime - $olddeltatime;
		last if ($newtime < $statslen);

		shift(@deltas);
		$deltas_tottime = $newtime;
	}

	# 6. The brains of the operation are here. First, check if we're
	# imbalanced, and if so set $do_reconfig. If $do_reconfig is set,
	# either because of imbalance or above in step 4, we evaluate a
	# new configuration.
	#
	# First, take @deltas and generate a single "compressed" delta
	# which summarizes them all. Pass that to do_reconfig and see
	# what it does with it:
	#
	# $ret == -1 : failure
	# $ret ==  0 : current config is optimal (or close enough)
	# $ret ==  1 : reconfiguration has occurred
	#
	# If $ret is -1 or 1, dump all our deltas and start from scratch.
	# Step 4 above will set do_reconfig soon thereafter.
	#
	# If $ret is 0, then nothing has happened because we're already
	# good enough. Set baseline_goodness to current goodness.

	$compdelta = compress_deltas(\@deltas);

	# A true result from VERIFY() is handled as a failed check here:
	# drop everything and resample.
	if (VERIFY(ref($compdelta) eq "HASH", "couldn't compress deltas")) {
		clear_deltas();
		next;
	}
	$compdelta->{goodness} = goodness($compdelta);
	dumpdelta($compdelta) if $debug;

	$goodness = $compdelta->{goodness};
	syslog('debug', "GOODNESS: %5.2f%%", $goodness * 100);

	# Imbalance only counts once the window holds at least $statslen
	# seconds of data.
	if ($deltas_tottime >= $statslen &&
	    imbalanced($goodness, $baseline_goodness)) {
		$do_reconfig = 1;
	}

	if ($do_reconfig) {
		$ret = do_reconfig($compdelta);

		if ($ret != 0) {
			clear_deltas();
			syslog('debug', "do_reconfig FAILED!") if $ret == -1;
		} else {
			syslog('debug', "setting new baseline of $goodness");
			$baseline_goodness = $goodness;
		}
	}
	syslog('debug', "---------------------------------------");
}
1408bd335c64Sesolom}
1409