xref: /illumos-gate/usr/src/cmd/intrd/intrd.pl (revision 1007fd6fd24227460e77ce89f5ca85641a85a576)
1#!/usr/perl5/bin/perl
2#
3# CDDL HEADER START
4#
5# The contents of this file are subject to the terms of the
6# Common Development and Distribution License (the "License").
7# You may not use this file except in compliance with the License.
8#
9# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10# or http://www.opensolaris.org/os/licensing.
11# See the License for the specific language governing permissions
12# and limitations under the License.
13#
14# When distributing Covered Code, include this CDDL HEADER in each
15# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16# If applicable, add the following below this CDDL HEADER, with the
17# fields enclosed by brackets "[]" replaced with your own identifying
18# information: Portions Copyright [yyyy] [name of copyright owner]
19#
20# CDDL HEADER END
21#
22
23#
24# Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
25#
26
27require 5.8.4;
28use strict;
29use warnings;
30use POSIX;
31use File::Basename("basename");
32
# Name used when opening the system log: the basename of this script.
my $cmdname = basename($0);

# Test-only switches, set while parsing the command line.
my $using_scengen = 0;			# 1 if using scenario simulator (-S)
my $debug = 0;				# 1 if -D was given

# Sampling intervals, all in seconds. $sleeptime always holds one of the
# three values below.
my $normal_sleeptime = 10;		# time to sleep between samples
my $idle_sleeptime = 45;		# time to sleep when idle
my $onecpu_sleeptime = 15 * 60;		# used if only 1 CPU on system
my $sleeptime = $normal_sleeptime;	# either normal_ or idle_ or onecpu_

# Thresholds and history length.
my $idle_intrload = 0.1;		# idle if interrupt load < 10%
my $timerange_toohi = 0.01;		# upper bound on getstat()'s timerange
my $statslen = 60;			# time period (in secs) to keep in @deltas
47
48
49# Parse arguments. intrd does not accept any public arguments; the two
50# arguments below are meant for testing purposes. -D generates a significant
51# amount of syslog output. -S <filename> loads the filename as a perl
52# script. That file is expected to implement a kstat "simulator" which
53# can be used to feed information to intrd and verify intrd's responses.
54
# Walk the argument list by hand. The loop tests with defined() so that an
# argument which is false as a string (such as "0") does not end the scan
# early. Arguments other than -S <filename> and -D are ignored.
while (defined(my $arg = shift(@ARGV))) {
	if ($arg eq "-S" && @ARGV) {
		$using_scengen = 1;
		do $ARGV[0];	# load simulator

		# do FILE reports a compile-time or run-time failure of the
		# loaded file in $@. Stop here with a clear message instead
		# of carrying on without a usable simulator.
		die "$cmdname: cannot load simulator $ARGV[0]: $@" if $@;
		shift(@ARGV);
	} elsif ($arg eq "-D") {
		$debug = 1;
	}
}
64
# When no simulator was loaded with -S, pull in the real kstat, interrupt
# and syslog modules at run time and open the system log. Messages above
# the chosen priority (debug with -D, info otherwise) are masked out.
if (!$using_scengen) {
	require Sun::Solaris::Kstat;

	require Sun::Solaris::Intrs;
	Sun::Solaris::Intrs->import(qw(intrmove is_apic));

	require Sys::Syslog;
	Sys::Syslog->import();

	openlog($cmdname, 'pid', 'daemon');

	my $maxpri = ($debug > 0) ? Sys::Syslog::LOG_DEBUG() :
	    Sys::Syslog::LOG_INFO();
	setlogmask(Sys::Syslog::LOG_UPTO($maxpri));
}
75
my $asserted = 0;		# number of VERIFY failures seen so far
my $assert_level = 'debug';	# syslog level for assertion failures

#
# VERIFY(condition, format, args...)
#
# Soft assertion. When the condition is numerically 0 the format and its
# arguments are logged through syslog() at $assert_level with a "VERIFY: "
# prefix, and $asserted is incremented. Returns true if the assertion
# FAILED, false if it held, so callers can write "if (VERIFY(...)) { bail }".
#
sub VERIFY($@)
{
	my ($cond, @report) = @_;

	my $failed = ($cond == 0);	# $cond == 0 means assert failed
	return ($failed) if !$failed;

	my ($fmt, @args) = @report;
	syslog($assert_level, "VERIFY: $fmt", @args);
	$asserted++;
	return ($failed);
}
88
89
90
91
# Forward declarations, so that the prototypes are known before the first
# call to each routine.

# stat/delta construction and display
sub getstat($$);
sub generate_delta($$);
sub compress_deltas($);
sub dumpdelta($);

# evaluation of a delta, and the reconfiguration entry point
sub goodness($);
sub imbalanced($$);
sub do_reconfig($);

# private helpers of the routines above
sub goodness_cpu($$);		# private function
sub move_intr($$$$);		# private function
sub ivecs_to_string(@);		# private function
sub do_find_goal($$$$);		# private function
sub find_goal($$);		# private function
sub do_reconfig_cpu2cpu($$$$);	# private function
sub do_reconfig_cpu($$$);	# private function
108
109
110#
# What follows are the basic data-structure routines of intrd.
112#
113# getstat() is responsible for reading the kstats and generating a "stat" hash.
114#
115# generate_delta() is responsible for taking two "stat" hashes and creating
116# a new "delta" hash that represents what has changed over time.
117#
118# compress_deltas() is responsible for taking a list of deltas and generating
119# a single delta hash that encompasses all the time periods described by the
120# deltas.
121
122
123#
124# getstat() is handed a reference to a kstat and generates a hash, returned
125# by reference, containing all the fields from the kstats which we need.
126# If it returns the scalar 0, it failed to gather the kstats, and the caller
127# should react accordingly.
128#
129# getstat() is also responsible for maintaining a reasonable $sleeptime.
130#
131# {"snaptime"}          kstat's snaptime
132# {<cpuid>}             one hash reference per online cpu
133#  ->{"tot"}            == cpu:<cpuid>:sys:cpu_nsec_{user + kernel + idle}
134#  ->{"crtime"}         == cpu:<cpuid>:sys:crtime
135#  ->{"ivecs"}
136#     ->{<cookie#>}     iterates over pci_intrs::<nexus>:cookie
137#        ->{"time"}     == pci_intrs:<ivec#>:<nexus>:time (in nsec)
138#        ->{"pil"}      == pci_intrs:<ivec#>:<nexus>:pil
139#        ->{"crtime"}   == pci_intrs:<ivec#>:<nexus>:crtime
140#        ->{"ino"}      == pci_intrs:<ivec#>:<nexus>:ino
141#        ->{"num_ino"}  == num inos of single device instance sharing this entry
142#				Will be > 1 on pcplusmp X86 systems for devices
143#				with multiple MSI interrupts.
144#        ->{"buspath"}  == pci_intrs:<ivec#>:<nexus>:buspath
145#        ->{"name"}     == pci_intrs:<ivec#>:<nexus>:name
146#        ->{"ihs"}      == pci_intrs:<ivec#>:<nexus>:ihs
147#
148
sub getstat($$)
{
	# $ks is a reference to the kstat tree. $pcplusmp_sys, when true,
	# makes the MSI interrupts of a device get consolidated into a single
	# ivec entry. Returns a reference to the "stat" hash, or the scalar 0
	# when there is at most one on-line CPU or when the kstat snaptimes
	# are spread too far apart to be treated as one instant.
	my ($ks, $pcplusmp_sys) = @_;

	my $cpucnt = 0;
	my %stat = ();
	my ($minsnap, $maxsnap);

	# Hash of hash which matches (MSI device, ino) combos to kstats.
	my %msidevs = ();

	# kstats are not generated atomically. Each kstat hierarchy will
	# have been generated within the kernel at a different time. On a
	# thrashing system, we may not run quickly enough in order to get
	# coherent kstat timing information across all the kstats. To
	# determine if this is occurring, $minsnap/$maxsnap are used to
	# find the breadth between the first and last snaptime of all the
	# kstats we access. $maxsnap - $minsnap roughly represents the
	# total time taken up in getstat(). If this time approaches the
	# time between snapshots, our results may not be useful.

	$minsnap = -1;		# snaptime is always a positive number
	$maxsnap = $minsnap;

	# Iterate over the cpus in cpu:<cpuid>::. Check
	# cpu_info:<cpuid>:cpu_info<cpuid>:state to make sure the
	# processor is "on-line". If not, it isn't accepting interrupts
	# and doesn't concern us.
	#
	# Record cpu:<cpuid>:sys:snaptime, and check $minsnap/$maxsnap.

	while (my ($cpu, $cpst) = each %{$ks->{cpu}}) {
		next if !exists($ks->{cpu_info}{$cpu}{"cpu_info$cpu"}{state});
		#"state" fld of kstat w/
		#		  modname    inst name-"cpuinfo0"
		my $state = $ks->{cpu_info}{$cpu}{"cpu_info$cpu"}{state};
		next if ($state !~ /^on-line\0/);
		my $cpu_sys = $cpst->{sys};

		$stat{$cpu}{tot} = ($cpu_sys->{cpu_nsec_idle} +
				    $cpu_sys->{cpu_nsec_user} +
				    $cpu_sys->{cpu_nsec_kernel});
		$stat{$cpu}{crtime} = $cpu_sys->{crtime};
		$stat{$cpu}{ivecs} = {};

		if ($minsnap == -1 || $cpu_sys->{snaptime} < $minsnap) {
			$minsnap = $cpu_sys->{snaptime};
		}
		if ($cpu_sys->{snaptime} > $maxsnap) {
			$maxsnap = $cpu_sys->{snaptime};
		}
		$cpucnt++;
	}

	if ($cpucnt <= 1) {
		$sleeptime = $onecpu_sleeptime;
		return (0);	# nothing to do with 1 CPU
	}

	# Iterate over the ivecs. If the cpu is not on-line, ignore the
	# ivecs mapped to it, if any.
	#
	# Record pci_intrs:{inum}:<nexus>:time, snaptime, crtime, pil,
	# ino, name, and buspath. Check $minsnap/$maxsnap.

	foreach my $inst (values(%{$ks->{pci_intrs}})) {
		my $intrcfg = (values(%$inst))[0];
		my $cpu = $intrcfg->{cpu};

		next unless exists $stat{$cpu};
		next if ($intrcfg->{type} =~ /^disabled\0/);

		# Perl looks beyond NULL chars in pattern matching.
		# Truncate name field at the first NULL
		$intrcfg->{name} =~ s/\0.*$//;

		if ($intrcfg->{snaptime} < $minsnap) {
			$minsnap = $intrcfg->{snaptime};
		} elsif ($intrcfg->{snaptime} > $maxsnap) {
			$maxsnap = $intrcfg->{snaptime};
		}

		my $cookie = "$intrcfg->{buspath} $intrcfg->{ino}";
		if (exists $stat{$cpu}{ivecs}{$cookie}) {
			my $cookiestats = $stat{$cpu}{ivecs}{$cookie};

			$cookiestats->{time} += $intrcfg->{time};
			$cookiestats->{name} .= "/$intrcfg->{name}";

			# If this new interrupt sharing $cookie represents a
			# change from an earlier getstat, make sure that
			# generate_delta will see the change by setting
			# crtime to the most recent crtime of its components.

			if ($intrcfg->{crtime} > $cookiestats->{crtime}) {
				$cookiestats->{crtime} = $intrcfg->{crtime};
			}
			$cookiestats->{ihs}++;
			next;
		}
		$stat{$cpu}{ivecs}{$cookie}{time} = $intrcfg->{time};
		$stat{$cpu}{ivecs}{$cookie}{crtime} = $intrcfg->{crtime};
		$stat{$cpu}{ivecs}{$cookie}{pil} = $intrcfg->{pil};
		$stat{$cpu}{ivecs}{$cookie}{ino} = $intrcfg->{ino};
		$stat{$cpu}{ivecs}{$cookie}{num_ino} = 1;
		$stat{$cpu}{ivecs}{$cookie}{buspath} = $intrcfg->{buspath};
		$stat{$cpu}{ivecs}{$cookie}{name} = $intrcfg->{name};
		$stat{$cpu}{ivecs}{$cookie}{ihs} = 1;

		if ($pcplusmp_sys && ($intrcfg->{type} =~ /^msi\0/)) {
			if (!(exists($msidevs{$intrcfg->{name}}))) {
				$msidevs{$intrcfg->{name}} = {};
			}
			$msidevs{$intrcfg->{name}}{$intrcfg->{ino}} =
			    \$stat{$cpu}{ivecs}{$cookie};
		}
	}

	# All MSI interrupts of a device instance share a single MSI address.
	# On X86 systems with an APIC, this MSI address is interpreted as CPU
	# routing info by the APIC.  For this reason, on these platforms, all
	# interrupts for MSI devices must be moved to the same CPU at the same
	# time.
	#
	# Since all interrupts will be on the same CPU on these platforms, all
	# interrupts can be consolidated into one ivec entry.  For such devices,
	# num_ino will be > 1 to denote that a group move is needed.

	# Loop thru all MSI devices on X86 pcplusmp systems.
	# Nop on other systems.
	foreach my $msidevkey (sort keys %msidevs) {

		# Loop thru inos of the device, sorted by lowest value first.
		# The sort must be numeric: a plain "sort keys" compares the
		# inos as strings and would put 10 ahead of 9, so the group
		# would not be anchored on its lowest ino.
		#
		# For each cookie found for a device, incr num_ino for the
		# lowest cookie and invalidate the other cookies.

		# Assumes PIL is the same for first and current cookies

		my $inos = $msidevs{$msidevkey};
		my ($first_cookiep, @other_cookieps) =
		    map { $inos->{$_} } sort { $a <=> $b } keys %$inos;

		foreach my $curr_cookiep (@other_cookieps) {
			$$first_cookiep->{num_ino}++;
			$$first_cookiep->{time} += $$curr_cookiep->{time};
			if ($$curr_cookiep->{crtime} >
			    $$first_cookiep->{crtime}) {
				$$first_cookiep->{crtime} =
				    $$curr_cookiep->{crtime};
			}
			# Invalidate this cookie, less complicated and
			# more efficient than deleting it.
			$$curr_cookiep->{num_ino} = 0;
		}
	}

	# We define the timerange as the amount of time spent gathering the
	# various kstats, divided by our sleeptime. If we take a lot of time
	# to access the kstats, and then we create a delta comparing these
	# kstats with a prior set of kstats, that delta will cover
	# substantially different amounts of time depending upon which
	# interrupt or CPU is being examined.
	#
	# By checking the timerange here, we guarantee that any deltas
	# created from these kstats will contain self-consistent data,
	# in that all CPUs and interrupts cover a similar span of time.
	#
	# $timerange_toohi is the upper bound. Any timerange above
	# this is thrown out as garbage. If the stat is safely within this
	# bound, we treat the stat as representing an instant in time, rather
	# than the time range it actually spans. We arbitrarily choose minsnap
	# as the snaptime of the stat.

	$stat{snaptime} = $minsnap;
	my $timerange = ($maxsnap - $minsnap) / $sleeptime;
	return (0) if ($timerange > $timerange_toohi);	# i.e. failure
	return (\%stat);
}
333
334#
335# dumpdelta takes a reference to our "delta" structure:
336# {"missing"}           "1" if the delta's component stats had inconsistencies
337# {"minsnap"}           time of the first kstat snaptime used in this delta
338# {"maxsnap"}           time of the last kstat snaptime used in this delta
339# {"goodness"}          cost function applied to this delta
340# {"avgintrload"}       avg of interrupt load across cpus, as a percentage
341# {"avgintrnsec"}       avg number of nsec spent in interrupts, per cpu
342# {<cpuid>}             iterates over on-line cpus
343#  ->{"intrs"}          cpu's movable intr time (sum of "time" for each ivec)
344#  ->{"tot"}            CPU load from all sources in nsec
345#  ->{"bigintr"}        largest value of {ivecs}{<ivec#>}{time} from below
346#  ->{"intrload"}       intrs / tot
347#  ->{"ivecs"}
348#     ->{<ivec#>}       iterates over ivecs for this cpu
349#        ->{"time"}     time used by this interrupt (in nsec)
350#        ->{"pil"}      pil level of this interrupt
351#        ->{"ino"}      interrupt number (or base vector if MSI group)
352#        ->{"buspath"}  filename of the directory of the device's bus
353#        ->{"name"}     device name
354#        ->{"ihs"}      number of different handlers sharing this ino
355#        ->{"num_ino"}  number of interrupt vectors in MSI group
356#
357# It prints out the delta structure in a nice, human readable display.
358#
359
sub dumpdelta($)
{
	# Log the contents of one delta, at syslog level 'debug': first the
	# delta-wide figures, then one group of lines per cpu, then one line
	# per ivec on that cpu.
	my ($delta) = @_;

	# delta-wide figures

	syslog('debug', "dumpdelta:");
	if ($delta->{missing} > 0) {
		syslog('debug', " RECONFIGURATION IN DELTA");
	}
	syslog('debug', " avgintrload: %5.2f%%  avgintrnsec: %d",
	       $delta->{avgintrload} * 100, $delta->{avgintrnsec});
	if (exists($delta->{goodness})) {
		syslog('debug', "    goodness: %5.2f%%",
		    $delta->{goodness} * 100);
	}

	# per-cpu figures; only the cpu entries of a delta are references

	foreach my $cpu (keys(%$delta)) {
		my $cpst = $delta->{$cpu};
		next unless ref($cpst);		# skip non-cpuid entries

		my $tot = $cpst->{tot};
		syslog('debug', "    cpu %3d intr %7.3f%%  (bigintr %7.3f%%)",
		       $cpu, $cpst->{intrload}*100, $cpst->{bigintr}*100/$tot);
		syslog('debug', "        intrs %d, bigintr %d",
		       $cpst->{intrs}, $cpst->{bigintr});

		# per-ivec figures for this cpu; an ino shared by several
		# handlers is shown with the handler count appended

		foreach my $ivec (keys(%{$cpst->{ivecs}})) {
			my $ivst = $cpst->{ivecs}{$ivec};
			my $label = $ivst->{name};
			if ($ivst->{ihs} > 1) {
				$label = "$ivst->{name}($ivst->{ihs})";
			}
			syslog('debug', "    %15s:\"%s\": %7.3f%%  %d",
			    $label, $ivec,
			    $ivst->{time}*100 / $tot, $ivst->{time});
		}
	}
}
393
394#
395# generate_delta($stat, $newstat) takes two stat references, returned from
396# getstat(), and creates a %delta. %delta (not surprisingly) contains the
397# same basic info as stat and newstat, but with the timestamps as deltas
398# instead of absolute times. We return a reference to the delta.
399#
400
sub generate_delta($$)
{
	# $stat is the older getstat() result and $newstat the newer one.
	# Always returns a reference to a delta hash. When the two stats
	# cannot be compared (time not ascending, a cpu or ivec appeared,
	# vanished or was recreated, a counter went backwards), {missing}
	# is set to 1 and the delta is returned only partially filled in.
	my ($stat, $newstat) = @_;

	my %delta = ();

	# Running totals over all cpus. They start at 0 so that the
	# averages computed at the end never examine an undefined value,
	# even if no cpu entry is processed.
	my $intrload = 0;
	my $intrnsec = 0;
	my $cpus = 0;

	# Take the worstcase timerange
	$delta{minsnap} = $stat->{snaptime};
	$delta{maxsnap} = $newstat->{snaptime};
	if (VERIFY($delta{maxsnap} > $delta{minsnap},
	    "generate_delta: stats aren't ascending")) {
		$delta{missing} = 1;
		return (\%delta);
	}

	# if there are a different number of cpus in the stats, set missing

	$delta{missing} = (keys(%$stat) != keys(%$newstat));
	if (VERIFY($delta{missing} == 0,
	    "generate_delta: number of CPUs changed")) {
		return (\%delta);
	}

	# scan through every cpu in %newstat and compare against %stat

	while (my ($cpu, $newcpst) = each %$newstat) {
		next if !ref($newcpst);		# skip non-cpuid fields

		# If %stat is missing a cpu from %newstat, then it was just
		# onlined. Mark missing.

		if (VERIFY(exists $stat->{$cpu} &&
		    $stat->{$cpu}{crtime} == $newcpst->{crtime},
		    "generate_delta: cpu $cpu changed")) {
			$delta{missing} = 1;
			return (\%delta);
		}
		my $cpst = $stat->{$cpu};
		$delta{$cpu}{tot} = $newcpst->{tot} - $cpst->{tot};
		if (VERIFY($delta{$cpu}{tot} >= 0,
		    "generate_delta: deltas are not ascending?")) {
			$delta{missing} = 1;
			delete($delta{$cpu});
			return (\%delta);
		}
		# Avoid remote chance of division by zero
		$delta{$cpu}{tot} = 1 if $delta{$cpu}{tot} == 0;
		$delta{$cpu}{intrs} = 0;
		$delta{$cpu}{bigintr} = 0;

		my %ivecs = ();
		$delta{$cpu}{ivecs} = \%ivecs;

		# if the number of ivecs differs, set missing

		if (VERIFY(keys(%{$cpst->{ivecs}}) ==
			   keys(%{$newcpst->{ivecs}}),
			   "generate_delta: cpu $cpu has more/less".
			   " interrupts")) {
			$delta{missing} = 1;
			return (\%delta);
		}

		while (my ($inum, $newivec) = each %{$newcpst->{ivecs}}) {

			# Unused cookie, corresponding to an MSI vector which
			# is part of a group.  The whole group is accounted for
			# by a different cookie.
			next if ($newivec->{num_ino} == 0);

			# If this ivec doesn't exist in $stat, or if $stat
			# shows a different crtime, set missing.
			if (VERIFY(exists $cpst->{ivecs}{$inum} &&
				   $cpst->{ivecs}{$inum}{crtime} ==
				   $newivec->{crtime},
				   "generate_delta: cpu $cpu inum $inum".
				   " has changed")) {
				$delta{missing} = 1;
				return (\%delta);
			}
			my $ivec = $cpst->{ivecs}{$inum};

			# Create $delta{$cpu}{ivecs}{$inum}.

			my %dltivec = ();
			$delta{$cpu}{ivecs}{$inum} = \%dltivec;

			# calculate time used by this interrupt

			my $time = $newivec->{time} - $ivec->{time};
			if (VERIFY($time >= 0,
				   "generate_delta: ivec went backwards?")) {
				$delta{missing} = 1;
				delete($delta{$cpu}{ivecs}{$inum});
				return (\%delta);
			}
			$delta{$cpu}{intrs} += $time;
			$dltivec{time} = $time;
			if ($time > $delta{$cpu}{bigintr}) {
				$delta{$cpu}{bigintr} = $time;
			}

			# Transfer over basic info about the kstat. We
			# don't have to worry about discrepancies between
			# ivec and newivec because we verified that both
			# have the same crtime.

			$dltivec{pil} = $newivec->{pil};
			$dltivec{ino} = $newivec->{ino};
			$dltivec{buspath} = $newivec->{buspath};
			$dltivec{name} = $newivec->{name};
			$dltivec{ihs} = $newivec->{ihs};
			$dltivec{num_ino} = $newivec->{num_ino};
		}
		if ($delta{$cpu}{tot} < $delta{$cpu}{intrs}) {
			# Ewww! Hopefully just a rounding error.
			# Make something up.
			$delta{$cpu}{tot} = $delta{$cpu}{intrs};
		}
		$delta{$cpu}{intrload} =
		       $delta{$cpu}{intrs} / $delta{$cpu}{tot};
		$intrload += $delta{$cpu}{intrload};
		$intrnsec += $delta{$cpu}{intrs};
		$cpus++;
	}
	if ($cpus > 0) {
		$delta{avgintrload} = $intrload / $cpus;
		$delta{avgintrnsec} = $intrnsec / $cpus;
	} else {
		$delta{avgintrload} = 0;
		$delta{avgintrnsec} = 0;
	}
	return (\%delta);
}
538
539
# compress_deltas() takes a list of deltas, and returns a single new delta
541# which represents the combined information from all the deltas. The deltas
542# provided are assumed to be sequential in time. The resulting compressed
543# delta looks just like any other delta. This new delta is also more accurate
544# since its statistics are averaged over a longer period than any of the
545# original deltas.
546
sub compress_deltas ($)
{
	# $deltas is a reference to a list of deltas, oldest first. Returns
	# a reference to one combined delta, or the scalar 0 if the list is
	# empty or contains a delta marked {missing}. Also sets $sleeptime
	# to the idle or the normal interval, depending on the highest
	# per-cpu interrupt load found in the combined delta.
	my ($deltas) = @_;

	my %newdelta = ();
	my $intrs = 0;		# interrupt nsec summed over all cpus
	my $tot = 0;		# total nsec summed over all cpus
	my $cpus = 0;
	my $high_intrload = 0;

	if (VERIFY($#$deltas != -1,
		   "compress_deltas: list of delta is empty?")) {
		return (0);
	}
	$newdelta{minsnap} = $deltas->[0]{minsnap};
	$newdelta{maxsnap} = $deltas->[$#$deltas]{maxsnap};
	$newdelta{missing} = 0;

	# Sum the per-cpu and per-ivec times of every delta. The descriptive
	# ivec fields are simply copied, so the last delta's values win.

	foreach my $delta (@$deltas) {
		if (VERIFY($delta->{missing} == 0,
		    "compressing bad deltas?")) {
			return (0);
		}
		while (my ($cpuid, $cpu) = each %$delta) {
			next if !ref($cpu);

			$intrs += $cpu->{intrs};
			$tot += $cpu->{tot};
			$newdelta{$cpuid}{intrs} += $cpu->{intrs};
			$newdelta{$cpuid}{tot} += $cpu->{tot};
			if (!exists $newdelta{$cpuid}{ivecs}) {
				my %ivecs = ();
				$newdelta{$cpuid}{ivecs} = \%ivecs;
			}
			while (my ($inum, $ivec) = each %{$cpu->{ivecs}}) {
				my $newivecs = $newdelta{$cpuid}{ivecs};
				$newivecs->{$inum}{time} += $ivec->{time};
				$newivecs->{$inum}{pil} = $ivec->{pil};
				$newivecs->{$inum}{ino} = $ivec->{ino};
				$newivecs->{$inum}{buspath} = $ivec->{buspath};
				$newivecs->{$inum}{name} = $ivec->{name};
				$newivecs->{$inum}{ihs} = $ivec->{ihs};
				$newivecs->{$inum}{num_ino} = $ivec->{num_ino};
			}
		}
	}

	# Derive bigintr and intrload for each cpu of the combined delta.

	foreach my $cpu (values(%newdelta)) {
		next if !ref($cpu); # ignore non-cpu fields
		$cpus++;

		my $bigintr = 0;
		foreach my $ivec (values(%{$cpu->{ivecs}})) {
			if ($ivec->{time} > $bigintr) {
				$bigintr = $ivec->{time};
			}
		}
		$cpu->{bigintr} = $bigintr;

		# Sanitize the divisor BEFORE it is used, so that a zero
		# total can never reach the division below.
		$cpu->{tot} = 1 if $cpu->{tot} <= 0;
		$cpu->{intrload} = $cpu->{intrs} / $cpu->{tot};
		if ($high_intrload < $cpu->{intrload}) {
			$high_intrload = $cpu->{intrload};
		}
	}
	if ($cpus == 0) {
		$newdelta{avgintrnsec} = 0;
		$newdelta{avgintrload} = 0;
	} else {
		$newdelta{avgintrnsec} = $intrs / $cpus;
		$newdelta{avgintrload} = ($tot > 0) ? $intrs / $tot : 0;
	}
	$sleeptime = ($high_intrload < $idle_intrload) ? $idle_sleeptime :
	    $normal_sleeptime;
	return (\%newdelta);
}
620
621
622
623
624
# What follows are the core functions responsible for examining the deltas
# generated above and deciding what to do about them.
#
# goodness() and its helper goodness_cpu() return a heuristic which describes
# how good (or bad) the current interrupt balance is. The value returned will
# be between 0 and 1, with 0 representing maximum goodness, and 1 representing
# maximum badness.
632#
633# imbalanced() compares a current and historical value of goodness, and
634# determines if there has been enough change to warrant evaluating a
635# reconfiguration of the interrupts
636#
637# do_reconfig(), and its helpers, do_reconfig_cpu(), do_reconfig_cpu2cpu(),
638# find_goal(), do_find_goal(), and move_intr(), are responsible for examining
639# a delta and determining the best possible assignment of interrupts to CPUs.
640#
641# It is important that do_reconfig() be in alignment with goodness(). If
642# do_reconfig were to generate a new interrupt distribution that worsened
643# goodness, we could get into a pathological loop with intrd fighting itself,
644# constantly deciding that things are imbalanced, and then changing things
645# only to make them worse.
646
647
648
649# any goodness over $goodness_unsafe_load is considered really bad
650# goodness must drop by at least $goodness_mindelta for a reconfig
651
my $goodness_unsafe_load = 0.9;	# loads above this are considered really bad
my $goodness_mindelta = 0.1;	# minimum goodness change worth a reconfig
654
# goodness(%delta) examines a delta and returns its "goodness". goodness will
# be between 0 (best) and 1 (major bad). goodness is determined by evaluating
# the goodness of each individual cpu, and returning the worst case. This
# helps on systems with many CPUs, where a single pathological CPU
# might otherwise be ignored because the average was OK.
660#
661# To calculate the goodness of an individual CPU, we start by looking at its
662# load due to interrupts. If the load is above a certain high threshold and
663# there is more than one interrupt assigned to this CPU, we set goodness
664# to worst-case. If the load is below the average interrupt load of all CPUs,
665# then we return best-case, since what's to complain about?
666#
667# Otherwise we look at how much the load is above the average, and return
668# that as the goodness, with one caveat: we never return more than the CPU's
669# interrupt load ignoring its largest single interrupt source. This is
670# because a CPU with one high-load interrupt, and no other interrupts, is
671# perfectly balanced. Nothing can be done to improve the situation, and thus
672# it is perfectly balanced even if the interrupt's load is 100%.
673
sub goodness($)
{
	# Returns the worst goodness_cpu() value found among the cpus of
	# $delta, or 1 right away when the delta is marked {missing}, when a
	# cpu scores exactly 1, or when a cpu's score falls outside 0..1 (in
	# which case the delta is also dumped to the log).
	my ($delta) = @_;

	return (1) if $delta->{missing} > 0;

	my $worst = 0;

	foreach my $cpu (values(%$delta)) {
		next unless ref($cpu);		# skip non-cpuid fields

		my $score = goodness_cpu($cpu, $delta->{avgintrload});
		if (VERIFY($score >= 0 && $score <= 1,
			   "goodness: cpu goodness out of range?")) {
			dumpdelta($delta);
			return (1);
		}

		# worst case, no need to continue
		return (1) if ($score == 1);

		$worst = $score if ($score > $worst);
	}
	return ($worst);
}
701
sub goodness_cpu($$)		# private function
{
	# $cpu is one cpu entry of a delta, $avgintrload the delta's average
	# interrupt load. Returns 0 when this cpu's load is below the
	# average, 1 when it is above $goodness_unsafe_load with more than
	# one ivec, and otherwise the load in excess of the average, capped
	# at the load that remains once the biggest interrupt is left out.
	my ($cpu, $avgintrload) = @_;

	my ($intrs, $tot) = @{$cpu}{qw(intrs tot)};
	my $load = $intrs / $tot;

	# low loads are perfectly good
	if ($load < $avgintrload) {
		return (0);
	}

	# Load due to interrupts, excluding the one biggest interrupt.
	# This is the most gain we can get on this CPU from offloading
	# interrupts.

	my $load_no_bigintr = ($intrs - $cpu->{bigintr}) / $tot;

	# A saturated CPU with more than one source of interrupts is a
	# major imbalance: the other interrupts could be starved if of a
	# lower pil. Returning 1, the worst possible value, effectively
	# contaminates the entire delta.

	my $nivecs = keys(%{$cpu->{ivecs}});
	return (1) if ($nivecs > 1 && $load > $goodness_unsafe_load);

	my $excess = $load - $avgintrload;
	return (($excess > $load_no_bigintr) ? $load_no_bigintr : $excess);
}
736
737
738# imbalanced() is used by the main routine to determine if the goodness
739# has shifted far enough from our last baseline to warrant a reassignment
740# of interrupts. A very high goodness indicates that a CPU is way out of
741# whack. If the goodness has varied too much since the baseline, then
742# perhaps a reconfiguration is worth considering.
743
sub imbalanced ($$)
{
	# Returns 1 when $goodness is above .50, or when it differs from
	# $baseline by more than $goodness_mindelta; returns 0 otherwise.
	my ($goodness, $baseline) = @_;

	my $pathological = ($goodness > .50);
	my $drifted = $pathological ||
	    (abs($goodness - $baseline) > $goodness_mindelta);

	return ($drifted ? 1 : 0);
}
754
755# do_reconfig(), do_reconfig_cpu(), and do_reconfig_cpu2cpu(), are the
756# decision-making functions responsible for generating a new interrupt
757# distribution. They are designed with the definition of goodness() in
758# mind, i.e. they use the same definition of "good distribution" as does
759# goodness().
760#
761# do_reconfig() is responsible for deciding whether a redistribution is
762# actually warranted. If the goodness is already pretty good, it doesn't
763# waste the CPU time to generate a new distribution. If it
764# calculates a new distribution and finds that it is not sufficiently
# improved from the prior distribution, it will not do the redistribution,
766# mainly to avoid the disruption to system performance caused by
767# rejuggling interrupts.
768#
769# Its main loop works by going through a list of cpus sorted from
770# highest to lowest interrupt load. It removes the highest-load cpus
771# one at a time and hands them off to do_reconfig_cpu(). This function
772# then re-sorts the remaining CPUs from lowest to highest interrupt load,
773# and one at a time attempts to rejuggle interrupts between the original
774# high-load CPU and the low-load CPU. Rejuggling on a high-load CPU is
775# considered finished as soon as its interrupt load is within
776# $goodness_mindelta of the average interrupt load. Such a CPU will have
777# a goodness of below the $goodness_mindelta threshold.
778
779#
780# move_intr(\%delta, $inum, $oldcpu, $newcpu)
781# used by reconfiguration code to move an interrupt between cpus within
782# a delta. This manipulates data structures, and does not actually move
783# the interrupt on the running system.
784#
sub move_intr($$$$)		# private function
{
	# Moves ivec $inum from cpu $oldcpuid to cpu $newcpuid inside $delta,
	# keeping {intrs}, {intrload} and {bigintr} of both cpus up to date
	# and recording the new home in the ivec's {nowcpu}.
	my ($delta, $inum, $oldcpuid, $newcpuid) = @_;

	my $moved = $delta->{$oldcpuid}{ivecs}{$inum};
	my $src = $delta->{$oldcpuid};
	my $time = $moved->{time};

	# Detach the ivec from the source cpu

	$src->{intrs} -= $time;
	$src->{intrload} = $src->{intrs} / $src->{tot};
	delete($src->{ivecs}{$inum});

	VERIFY($src->{intrs} >= 0, "move_intr: intr's time > total time?");
	VERIFY($time <= $src->{bigintr},
	       "move_intr: intr's time > bigintr?");

	# If the source cpu's biggest interrupt just left, look for the
	# biggest one among those which remain.

	if ($time >= $src->{bigintr}) {
		my $bigtime = 0;

		foreach my $left (values(%{$src->{ivecs}})) {
			if ($left->{time} > $bigtime) {
				$bigtime = $left->{time};
			}
		}
		$src->{bigintr} = $bigtime;
	}

	# Attach the ivec to the target cpu

	my $tgt = $delta->{$newcpuid};

	$moved->{nowcpu} = $newcpuid;
	$tgt->{intrs} += $time;
	$tgt->{intrload} = $tgt->{intrs} / $tgt->{tot};
	$tgt->{ivecs}{$inum} = $moved;

	$tgt->{bigintr} = $time
		if $time > $tgt->{bigintr};
}
823
sub move_intr_check($$$)	# private function
{
	# Sanity check on the two cpus involved in a move: neither should
	# carry more interrupt time than its total time. Violations are
	# reported through VERIFY.
	my ($delta, $oldcpuid, $newcpuid) = @_;

	my $src_ok =
	    ($delta->{$oldcpuid}{intrs} <= $delta->{$oldcpuid}{tot});
	VERIFY($src_ok, "Moved interrupts left 100+%% load on src cpu");

	my $tgt_ok =
	    ($delta->{$newcpuid}{intrs} <= $delta->{$newcpuid}{tot});
	VERIFY($tgt_ok, "Moved interrupts left 100+%% load on tgt cpu");
}
833
sub ivecs_to_string(@)		# private function
{
	# Returns the {inum} of every ivec passed in, each preceded by a
	# single space, concatenated in argument order. An empty argument
	# list gives the empty string.
	return (join("", map { " $_->{inum}" } @_));
}
842
843
sub do_reconfig($)
{
	# Works out a better interrupt distribution within $delta and, if it
	# is enough of an improvement, applies it with intrmove().
	# Returns  0 if nothing was moved because it was not worthwhile,
	#          1 if every interrupt that had to move was moved,
	#         -1 if the delta is marked {missing} or a move failed.
	my ($delta) = @_;

	my $goodness = $delta->{goodness};

	# Goodness cannot get better than 0. If even reaching 0 would be
	# too small an improvement to merit the action, stop here.

	if ($goodness < $goodness_mindelta) {
		syslog('debug', "goodness good enough, don't reconfig");
		return (0);
	}

	syslog('notice', "Optimizing interrupt assignments");

	if (VERIFY ($delta->{missing} == 0, "RECONFIG Aborted: should not ".
	    "have a delta with missing")) {
		return (-1);
	}

	# Collect the cpuids, and tag every ivec with the cpu it starts on,
	# the cpu it is on now, and its own key.

	my @cpusortlist = ();

	foreach my $cpuid (keys(%$delta)) {
		my $cpu = $delta->{$cpuid};
		next unless ref($cpu);	# skip non-cpu entries

		push(@cpusortlist, $cpuid);
		foreach my $inum (keys(%{$cpu->{ivecs}})) {
			my $ivec = $cpu->{ivecs}{$inum};
			@{$ivec}{qw(origcpu nowcpu inum)} =
			    ($cpuid, $cpuid, $inum);
		}
	}

	# Repeatedly take the cpu with the highest interrupt load off the
	# list and try to redistribute its interrupts. A cpu whose goodness
	# is below the threshold is skipped. Once the top cpu's load is no
	# more than the average load plus that same threshold, no cpu left
	# is worth reconfiguring, and we're done.

	while (@cpusortlist) {
		# Sort again on every pass, since do_reconfig_cpu can move
		# interrupts around.

		@cpusortlist =
		    sort({$delta->{$b}{intrload} <=> $delta->{$a}{intrload}}
		    @cpusortlist);

		my $cpu = shift(@cpusortlist);
		my $load = $delta->{$cpu}{intrload};

		if ($load <= $goodness_unsafe_load &&
		    $load <= $delta->{avgintrload} + $goodness_mindelta) {
			syslog('debug', "finished reconfig: cpu $cpu load ".
			    "$delta->{$cpu}{intrload} avgload ".
			    "$delta->{avgintrload}");
			last;
		}

		next if (goodness_cpu($delta->{$cpu}, $delta->{avgintrload}) <
		    $goodness_mindelta);

		do_reconfig_cpu($delta, \@cpusortlist, $cpu);
	}

	# How good a job did we do? If the improvement was minimal, and
	# our goodness wasn't pathological (and thus needing any help it
	# can get), then don't bother moving the interrupts.

	my $newgoodness = goodness($delta);
	VERIFY($newgoodness <= $goodness,
	       "reconfig: result has worse goodness?");

	my $cured_worst_case = ($goodness == 1 && $newgoodness != 1);
	if (!$cured_worst_case &&
	    $goodness - $newgoodness < $goodness_mindelta) {
		syslog('debug', "goodness already near optimum, ".
		       "don't reconfig");
		return (0);
	}
	syslog('debug', "goodness %5.2f%% --> %5.2f%%", $goodness*100,
	       $newgoodness*100);

	# Time to move those interrupts! Every ivec which ended up on a
	# cpu other than the one it started on is handed to intrmove().

	my $ret = 1;
	my $warned = 0;
	foreach my $cpuid (keys(%$delta)) {
		next if $cpuid =~ /\D/;		# cpuids are all digits

		my $cpu = $delta->{$cpuid};
		foreach my $ivec (values(%{$cpu->{ivecs}})) {
			next if ($ivec->{origcpu} == $cpuid);

			next if intrmove($ivec->{buspath}, $ivec->{origcpu},
			    $ivec->{ino}, $cpuid, $ivec->{num_ino});

			# The move failed: warn once, log the details of
			# every failure at debug level.
			syslog('warning', "Unable to move interrupts")
			    if $warned++ == 0;
			syslog('debug', "Unable to move buspath ".
			    "$ivec->{buspath} ino $ivec->{ino} to ".
			    "cpu $cpuid");
			$ret = -1;
		}
	}

	syslog('notice', "Interrupt assignments optimized");
	return ($ret);
}
954
#
# do_reconfig_cpu() tries to bring down the interrupt load of $oldcpuid by
# exchanging interrupts with the CPUs named in @$cpusortlist.
#
# The caller keeps that list sorted from highest to lowest interrupt
# load, so walking it in reverse visits the least loaded CPU first. The
# walk ends as soon as
#   - $oldcpuid's goodness has dropped below $goodness_mindelta, or
#   - the next candidate carries more load than $oldcpuid itself.
# The caller's list is only read, never modified.
#
sub do_reconfig_cpu($$$)	# private function
{
	my ($delta, $cpusortlist, $oldcpuid) = @_;

	syslog('debug', "reconfiguring $oldcpuid");

	my $src = $delta->{$oldcpuid};
	my $avgload = $delta->{avgintrload};

	# reverse() returns a new list, so this iterates over a copy.
	foreach my $tgtcpuid (reverse(@$cpusortlist)) {
		if (goodness_cpu($src, $avgload) < $goodness_mindelta) {
			last;
		}

		my $tgt = $delta->{$tgtcpuid};
		my $srcload = $src->{intrload};
		if ($tgt->{intrload} > $srcload) {
			last;
		}

		do_reconfig_cpu2cpu($delta, $oldcpuid, $tgtcpuid, $srcload);
	}
}
985
#
# do_reconfig_cpu2cpu() considers juggling interrupts between $srccpuid
# (the CPU with the higher interrupt load) and $tgtcpuid (the CPU with
# the lower one). $srcload is the interrupt load of $srccpuid before any
# exchange and is only used for the sanity check at the end.
#
# All changes are made to $delta through move_intr(); nothing is
# retargeted on the running system here.
#
sub do_reconfig_cpu2cpu($$$$)	# private function
{
	my ($delta, $srccpuid, $tgtcpuid, $srcload) = @_;

	syslog('debug', "exchanging intrs between $srccpuid and $tgtcpuid");

	# Pool the ivecs of both CPUs into one list, ordered from the
	# highest to the lowest {time}. With nothing to move we are done.

	my @ivecs = sort({ $b->{time} <=> $a->{time} }
	    values(%{$delta->{$srccpuid}{ivecs}}),
	    values(%{$delta->{$tgtcpuid}{ivecs}}));
	if (!@ivecs) {
		return;
	}

	# The load we aim for on srccpuid is the average across all CPUs;
	# find_goal() determines the selection of the available interrupts
	# that comes closest to the goal without falling below it.
	#
	# tgtcpuid is less loaded than srccpuid, but it may still be above
	# avgintrnsec. So never aim lower than the mean of the two CPUs,
	# otherwise srccpuid would end up below the load on tgtcpuid.

	my $pairavg =
	    ($delta->{$srccpuid}{intrs} + $delta->{$tgtcpuid}{intrs}) / 2;
	my $goal = $delta->{avgintrnsec};
	$goal = $pairavg if ($goal < $pairavg);

	# When the largest interrupt started out on srccpuid, leave it
	# where it is and only distribute the rest. This can help minimize
	# the disruption caused by moving interrupts.

	my $largest = $ivecs[0];
	if ($largest->{origcpu} == $srccpuid) {
		syslog('debug', "Keeping $largest->{inum} on $srccpuid");
		$goal -= $largest->{time};
		shift(@ivecs);
	}

	syslog('debug', "GOAL: inums should total $goal");
	find_goal(\@ivecs, $goal);

	# find_goal() reports its answer through $ivec->{goal}: set means
	# the ivec belongs on srccpuid, clear means it belongs on tgtcpuid.
	# Record each required move in $delta with move_intr().

	foreach my $ivec (@ivecs) {
		syslog('debug', "ivec $ivec->{inum} goal $ivec->{goal}");
		VERIFY($ivec->{nowcpu} == $srccpuid ||
		    $ivec->{nowcpu} == $tgtcpuid,
		    "cpu2cpu found an interrupt not currently on src or ".
		    "tgt cpu");

		my $wantcpu = $ivec->{goal} ? $srccpuid : $tgtcpuid;
		if ($ivec->{nowcpu} != $wantcpu) {
			move_intr($delta, $ivec->{inum}, $ivec->{nowcpu},
			    $wantcpu);
		}
	}
	move_intr_check($delta, $srccpuid, $tgtcpuid); # asserts

	# The source CPU must not have become busier than it was, and it
	# must still be above the average load.

	my $srccpu = $delta->{$srccpuid};
	my $newload = $srccpu->{intrs} / $srccpu->{tot};
	VERIFY($newload <= $srcload && $newload > $delta->{avgintrload},
	    "cpu2cpu: new load didn't end up in expected range");
}
1058
1059
# find_goal() and its helper do_find_goal() pick, out of a list of
# interrupts, the combination whose total load is as close as possible to
# a goal load without falling below that goal. find_goal() reports the
# outcome by storing a new value, {goal}, in the hash of every interrupt:
# it is set for the interrupts that belong to the chosen combination and
# cleared for all others.
#
# find_goal() takes a reference to a list of ivecs (hash references),
# sorted by descending {time}, and the goal load. The goal is expressed
# in the same units as {time}. The best fit is found with a depth-first
# search carried out by the recursive subroutine do_find_goal().
#
# do_find_goal() is passed an index, 0 on the initial call. A given
# invocation only looks at the interrupts in the ivecs array from that
# index onwards, and weighs two possibilities:
#   1) What is the best goal-fit if I include ivecs[index]?
#   2) What is the best goal-fit if I exclude ivecs[index]?
# For case 1 it subtracts the load of ivecs[index] from the goal and
# calls itself with that reduced goal and index + 1.
# For case 2 it calls itself with the unchanged goal and index + 1.
#
# It then compares the two results, decides which one meets the goal
# best, and returns it. The return value is the interrupt load of the
# best fit, followed by the list of interrupts that make up that fit.
#
# As an optimization, a second array loads[] mirrors ivecs[]: loads[i] is
# the total load of ivecs[i..$#ivecs]. do_find_goal() uses it to avoid
# recursing all the way to the end of the ivecs array when even including
# every remaining interrupt leaves the fit at or below the goal load. In
# that case it puts all remaining interrupts on the goal list and returns.
#
sub find_goal($$)		# private function
{
	my ($ivecs, $goal) = @_;

	my @goals = ();		# stays empty when $goal <= 0: the empty
				# set is then the best possible fit
	my $load;

	if ($goal > 0) {
		syslog('debug', "finding goal from intrs %s",
		    ivecs_to_string(@$ivecs));

		# Generate @loads: start from the grand total and peel off
		# one interrupt at a time, front to back.

		my $remaining = 0;
		foreach my $ivec (@$ivecs) {
			$remaining += $ivec->{time};
		}

		my @loads = ();
		foreach my $i (0 .. $#{$ivecs}) {
			$loads[$i] = $remaining;
			$remaining -= $ivecs->[$i]{time};
		}

		($load, @goals) = do_find_goal($ivecs, \@loads, $goal, 0);
		VERIFY($load >= $goal, "find_goal didn't meet goals");
	}
	syslog('debug', "goals found: %s", ivecs_to_string(@goals));

	# Mark every ivec according to @goals. The chosen ivecs come back
	# in the same relative order as in @$ivecs, so it is enough to
	# compare each ivec against the head of @goals and consume that
	# head on a match.

	foreach my $ivec (@$ivecs) {
		if (@goals && $ivec == $goals[0]) {
			shift(@goals);
			syslog('debug', "inum $ivec->{inum} on source cpu");
			$ivec->{goal} = 1;
			next;
		}
		syslog('debug', "inum $ivec->{inum} on target cpu");
		$ivec->{goal} = 0;
	}
}
1136
1137
#
# do_find_goal() is the recursive search behind find_goal().
#
#   $ivecs	reference to the list of ivecs, sorted by descending {time}
#   $loads	reference to the mirror array; $loads->[i] is the total
#		{time} of $ivecs->[i .. end]
#   $goal	the load still to be reached by the ivecs from $idx onwards
#   $idx	index of the first ivec this invocation may consider
#
# Returns a list: the load of the best fit found, followed by the ivecs
# that make up that fit. Past the end of the list it returns just (0).
#
sub do_find_goal($$$$)		# private function
{
	my ($ivecs, $loads, $goal, $idx) = @_;

	my $lastidx = $#{$ivecs};
	if ($idx > $lastidx) {
		return (0);
	}
	syslog('debug', "$idx: finding goal $goal inum $ivecs->[$idx]{inum}");

	# Shortcut: when everything from $idx to the end of the list adds
	# up to no more than the goal, taking all of it is the best that
	# can be done, and there is no need to search any deeper.

	my $remaining = $loads->[$idx];
	if ($remaining <= $goal) {
		my @rest = @$ivecs[$idx .. $lastidx];
		syslog('debug',
		    "$idx: including all remaining intrs %s with load %d",
		    ivecs_to_string(@rest), $remaining);
		return ($remaining, @rest);
	}

	# Option "with": the best fit that includes $ivecs->[$idx]. If this
	# one interrupt already reaches the goal by itself, stop here;
	# adding further interrupts would only overshoot the goal more.

	my $load = $ivecs->[$idx]{time};
	my $with = $load;
	my @goals_with = ();

	if ($goal > $load) {
		($with, @goals_with) =
		    do_find_goal($ivecs, $loads, $goal - $load, $idx + 1);
		$with += $load;
	}
	syslog('debug', "$idx: with-load $with intrs %s",
	    ivecs_to_string($ivecs->[$idx], @goals_with));

	# Option "without": the best fit that leaves $ivecs->[$idx] out.
	# (The & form calls the sub without applying its prototype.)

	my ($without, @goals_without) =
	    &do_find_goal($ivecs, $loads, $goal, $idx + 1);
	syslog('debug', "$idx: without-load $without intrs %s",
	    ivecs_to_string(@goals_without));

	# Pick the winner. An option that reaches the goal beats one that
	# does not. If both reach it, the smaller load wins; if neither
	# does, the larger load wins. On a tie we go "with".

	my $take_without;
	if ($with >= $goal) {
		$take_without = ($without >= $goal && $without < $with);
	} else {
		$take_without = ($without >= $goal || $without > $with);
	}

	# Hand back the load of the winning option followed by the ivecs
	# it is composed of.

	if ($take_without) {
		syslog('debug', "$idx: going without");
		return ($without, @goals_without);
	}

	syslog('debug', "$idx: going with");
	return ($with, $ivecs->[$idx], @goals_with);
}
1217
1218
1219
1220
syslog('debug', "intrd is starting".($debug ? " (debug)" : ""));

# State that lives across iterations of the main loop.
my @deltas = ();		# deltas inside the current statistics window
my $deltas_tottime = 0;		# sum of maxsnap-minsnap across @deltas
my $avggoodness;
my $baseline_goodness = 0;
my $compdelta;
my $do_reconfig;

# Scratch variables used by the main loop.
my ($goodness, $deltatime, $olddelta, $olddeltatime, $delta);
my ($newstat, $below_statslen, $newtime, $ret);


# SIGINT, SIGHUP and SIGTERM only raise a flag, which is polled at safe
# points, so that we don't die in the middle of retargeting.
my $gotsig = 0;
$SIG{TERM} = $SIG{HUP} = $SIG{INT} = sub { $gotsig = 1; };

# Kstats come either from the real kstat chain or, under the scenario
# simulator, from myks_update(), which the simulator supplies.
my $ks = ($using_scengen == 0) ?
    Sun::Solaris::Kstat->new() : myks_update();

# Without any pci_intrs kstats there is nothing for us to do. Exiting is
# not an option, though: SMF would restart us and/or report an error to
# the administrator, who cannot do anything about it. So leave a message
# for the SMF log and quietly wait until a signal tells us to go away.

if (!exists($ks->{pci_intrs})) {
	print STDERR "$cmdname: no interrupts were found; your PCI bus ".
	    "may not yet be supported\n";
	while ($gotsig == 0) {
		pause();
	}
	exit(0);
}

# Find out whether this is a system with a pcplusmp APIC; such systems
# get special handling. It is assumed that either all or none of the
# busses on a system are hosted by the pcplusmp APIC or APIX, so asking
# about a single buspath is enough.

# Take the first value of the first pci_intrs kstat ...
my @elem = values(%{$ks->{pci_intrs}});
my ($elem0) = @elem;
my ($elemval) = values(%$elem0);

# ... and use its buspath to query the system.
my $pcplusmp_sys = is_apic($elemval->{buspath});

# First snapshot; the main loop generates deltas against it.
my $stat = getstat($ks, $pcplusmp_sys);
1281
# Throw away the statistics gathered so far. With $stat cleared, the next
# pass of the main loop just stores its new snapshot as the starting point
# instead of generating a delta against the stale one.
sub clear_deltas {
	@deltas = ();
	$deltas_tottime = 0;
	$stat = 0;   # prevent next gen_delta() from setting {missing}
}

while (1) {
	# 1. Sleep, refresh the kstats, and take a new snapshot in
	# $newstat. A pending ^C / SIGTERM makes us exit, both before
	# and after sleeping.

	if ($gotsig) {
		exit(0);
	}
	if ($using_scengen != 0) {
		$ks = myks_update();
	} else {
		sleep($sleeptime);
		if ($gotsig) {
			exit(0);
		}
		$ks->update();
	}
	$newstat = getstat($ks, $pcplusmp_sys);

	# Either snapshot can be zero: because it was never initialized,
	# or because getstat() failed. Without a usable $newstat we just
	# sleep and try again, hoping the problem clears up. Without a
	# usable $stat, the new snapshot becomes $stat and we go around
	# once more to obtain something to compare it with.

	if (!ref($newstat)) {
		next;
	}
	if (!ref($stat)) {
		$stat = $newstat;
		next;
	}

	# 2. Compare $newstat with the prior snapshot; result in %$delta.

	$delta = generate_delta($stat, $newstat);
	if ($debug) {
		dumpdelta($delta);	# Dump most recent stats to stdout.
	}
	$stat = $newstat;	# The new stats now become the old stats.


	# 3. A delta with {missing} set means that cpus or interrupts
	# (probably both) have been reconfigured. Our old statistics are
	# then worthless and we start from scratch.
	#
	# The same applies to a delta covering a very long range of
	# time: the system has been so overloaded that intrd was not
	# allowed to run effectively for a while.

	$deltatime = $delta->{maxsnap} - $delta->{minsnap};
	if ($delta->{missing} > 0 || $deltatime > $statslen) {
		clear_deltas();
		syslog('debug', "evaluating interrupt assignments");
		next;
	}


	# 4. Add the new delta to the list of deltas and update the
	# associated statistics. If this delta is the one that takes
	# the accumulated time up to $statslen, it is time to evaluate
	# a reconfiguration.

	$below_statslen = ($deltas_tottime < $statslen);
	push(@deltas, $delta);
	$deltas_tottime += $deltatime;
	$do_reconfig = ($below_statslen && $deltas_tottime >= $statslen);

	# 5. @deltas acts as a moving average over the last $statslen
	# seconds. Shift off the oldest deltas, but only as long as
	# doing so does not take us below $statslen seconds.

	while (@deltas > 1) {
		$olddelta = $deltas[0];
		$olddeltatime = $olddelta->{maxsnap} - $olddelta->{minsnap};
		$newtime = $deltas_tottime - $olddeltatime;
		if ($newtime < $statslen) {
			last;
		}

		shift(@deltas);
		$deltas_tottime = $newtime;
	}

	# 6. The brains of the operation are here. First, check if we're
	# imbalanced, and if so set $do_reconfig. If $do_reconfig is set,
	# either because of imbalance or by step 4 above, we evaluate a
	# new configuration.
	#
	# To do so, @deltas is boiled down into a single "compressed"
	# delta summarizing all of them, which is handed to do_reconfig:
	#
	# $ret == -1 : failure
	# $ret ==  0 : current config is optimal (or close enough)
	# $ret ==  1 : reconfiguration has occurred
	#
	# If $ret is -1 or 1, dump all our deltas and start from scratch.
	# Step 4 above will set do_reconfig soon thereafter.
	#
	# If $ret is 0, nothing has happened because we're already good
	# enough. The current goodness becomes baseline_goodness.

	$compdelta = compress_deltas(\@deltas);
	if (VERIFY(ref($compdelta) eq "HASH", "couldn't compress deltas")) {
		clear_deltas();
		next;
	}
	$goodness = $compdelta->{goodness} = goodness($compdelta);
	if ($debug) {
		dumpdelta($compdelta);
	}

	syslog('debug', "GOODNESS: %5.2f%%", $goodness * 100);

	if ($deltas_tottime >= $statslen &&
	    imbalanced($goodness, $baseline_goodness)) {
		$do_reconfig = 1;
	}

	if ($do_reconfig) {
		$ret = do_reconfig($compdelta);

		if ($ret == 0) {
			syslog('debug', "setting new baseline of $goodness");
			$baseline_goodness = $goodness;
		} else {
			clear_deltas();
			if ($ret == -1) {
				syslog('debug', "do_reconfig FAILED!");
			}
		}
	}
	syslog('debug', "---------------------------------------");
}
1408