#!/usr/perl5/bin/perl
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
#

require 5.8.4;
use strict;
use warnings;
use POSIX;
use File::Basename("basename");

my $cmdname = basename($0);

my $using_scengen = 0;                  # 1 if using scenario simulator
my $debug = 0;

my $normal_sleeptime = 10;              # time to sleep between samples
my $idle_sleeptime = 45;                # time to sleep when idle
my $onecpu_sleeptime = (60 * 15);       # used if only 1 CPU on system
my $sleeptime = $normal_sleeptime;      # either normal_ or idle_ or onecpu_

my $idle_intrload = .1;                 # idle if interrupt load < 10%

my $timerange_toohi = .01;
my $statslen = 60;      # time period (in secs) to keep in @deltas


# Parse arguments.  intrd does not accept any public arguments; the two
# arguments below are meant for testing purposes.  -D generates a significant
# amount of syslog output.  -S <filename> loads the filename as a perl
# script.  That file is expected to implement a kstat "simulator" which
# can be used to feed information to intrd and verify intrd's responses.
#
# The loop uses a lexical and a defined() test so that it neither clobbers
# the global $_ nor stops early on a false-valued argument such as "0".

while (defined(my $arg = shift @ARGV)) {
    if ($arg eq "-S" && @ARGV) {
        my $simulator = shift @ARGV;
        $using_scengen = 1;

        # load simulator.  "do FILE" reports compile errors in $@ and
        # returns undef when the file cannot be read; neither case
        # may be silently ignored, since the rest of intrd would then
        # run without any of the simulator's routines.
        my $loaded = do $simulator;
        if ($@) {
            die "$cmdname: cannot compile $simulator: $@";
        } elsif (!defined($loaded) && !-r $simulator) {
            die "$cmdname: cannot read $simulator: $!\n";
        }
    } elsif ($arg eq "-D") {
        $debug = 1;
    }
}

# When not driven by the simulator, load the real kstat/interrupt modules
# at run time and route all diagnostics to syslog.  Debug-level messages
# are only let through when -D was given.

if ($using_scengen == 0) {
    require Sun::Solaris::Kstat;
    require Sun::Solaris::Intrs;
    Sun::Solaris::Intrs->import(qw(intrmove is_apic));
    require Sys::Syslog;
    Sys::Syslog->import();
    openlog($cmdname, 'pid', 'daemon');
    setlogmask(Sys::Syslog::LOG_UPTO($debug > 0 ? &Sys::Syslog::LOG_DEBUG :
        &Sys::Syslog::LOG_INFO));
}

# VERIFY(condition, format, args...) is intrd's soft assertion.  It returns
# true when the assertion FAILED (i.e. the condition compares equal to 0),
# in which case the message is logged at $assert_level and $asserted is
# incremented.  It returns false, and logs nothing, when the condition holds.

my $asserted = 0;
my $assert_level = 'debug';     # syslog level for assertion failures
sub VERIFY($@)
{
    my $bad = (shift() == 0);   # $_[0] == 0 means assert failed
    if ($bad) {
        my $msg = shift();
        syslog($assert_level, "VERIFY: $msg", @_);
        $asserted++;
    }
    return ($bad);
}




sub getstat($$);
sub generate_delta($$);
sub compress_deltas($);
sub dumpdelta($);

sub goodness($);
sub imbalanced($$);
sub do_reconfig($);

sub goodness_cpu($$);           # private function
sub move_intr($$$$);            # private function
sub ivecs_to_string(@);         # private function
sub do_find_goal($$$$);         # private function
sub find_goal($$);              # private function
sub do_reconfig_cpu2cpu($$$$);  # private function
sub do_reconfig_cpu($$$);       # private function


#
# What follow are the basic data structures routines of intrd.
#
# getstat() is responsible for reading the kstats and generating a "stat" hash.
#
# generate_delta() is responsible for taking two "stat" hashes and creating
# a new "delta" hash that represents what has changed over time.
#
# compress_deltas() is responsible for taking a list of deltas and generating
# a single delta hash that encompasses all the time periods described by the
# deltas.


#
# getstat() is handed a reference to a kstat and generates a hash, returned
# by reference, containing all the fields from the kstats which we need.
# If it returns the scalar 0, it failed to gather the kstats, and the caller
# should react accordingly.
#
# getstat() is also responsible for maintaining a reasonable $sleeptime.
#
# {"snaptime"}          kstat's snaptime
# {<cpuid>}             one hash reference per online cpu
#  ->{"tot"}            == cpu:<cpuid>:sys:cpu_nsec_{user + kernel + idle}
#  ->{"crtime"}         == cpu:<cpuid>:sys:crtime
#  ->{"ivecs"}
#     ->{<cookie#>}     iterates over pci_intrs::<nexus>:cookie
#        ->{"time"}     == pci_intrs:<ivec#>:<nexus>:time (in nsec)
#        ->{"pil"}      == pci_intrs:<ivec#>:<nexus>:pil
#        ->{"crtime"}   == pci_intrs:<ivec#>:<nexus>:crtime
#        ->{"ino"}      == pci_intrs:<ivec#>:<nexus>:ino
#        ->{"num_ino"}  == num inos of single device instance sharing this entry
#                          Will be > 1 on pcplusmp X86 systems for devices
#                          with multiple MSI interrupts.
#        ->{"buspath"}  == pci_intrs:<ivec#>:<nexus>:buspath
#        ->{"name"}     == pci_intrs:<ivec#>:<nexus>:name
#        ->{"ihs"}      == pci_intrs:<ivec#>:<nexus>:ihs
#
# Arguments:
#   $ks            reference to the kstat hash (cpu, cpu_info, pci_intrs)
#   $pcplusmp_sys  true if MSI vectors of one device must be grouped
#
# Returns a reference to the stat hash described above, or the scalar 0
# when fewer than two CPUs are on-line or the kstats were gathered over
# too wide a time range to be self-consistent.
#

sub getstat($$)
{
    my ($ks, $pcplusmp_sys) = @_;

    my $cpucnt = 0;
    my %stat = ();
    my ($minsnap, $maxsnap);

    # Hash of hash which matches (MSI device, ino) combos to the stat
    # entry (hash reference) recorded for that cookie.
    my %msidevs = ();

    # kstats are not generated atomically.  Each kstat hierarchy will
    # have been generated within the kernel at a different time.  On a
    # thrashing system, we may not run quickly enough in order to get
    # coherent kstat timing information across all the kstats.  To
    # determine if this is occurring, $minsnap/$maxsnap are used to
    # find the breadth between the first and last snaptime of all the
    # kstats we access.  $maxsnap - $minsnap roughly represents the
    # total time taken up in getstat().  If this time approaches the
    # time between snapshots, our results may not be useful.

    $minsnap = -1;              # snaptime is always a positive number
    $maxsnap = $minsnap;

    # Iterate over the cpus in cpu:<cpuid>::.  Check
    # cpu_info:<cpuid>:cpu_info<cpuid>:state to make sure the
    # processor is "on-line".  If not, it isn't accepting interrupts
    # and doesn't concern us.
    #
    # Record cpu:<cpuid>:sys:snaptime, and check $minsnap/$maxsnap.

    while (my ($cpu, $cpst) = each %{$ks->{cpu}}) {
        # "state" field of the kstat with module "cpu_info",
        # instance <cpuid>, name "cpu_info<cpuid>"
        next if !exists($ks->{cpu_info}{$cpu}{"cpu_info$cpu"}{state});
        my $state = $ks->{cpu_info}{$cpu}{"cpu_info$cpu"}{state};
        next if ($state !~ /^on-line\0/);
        my $cpu_sys = $cpst->{sys};

        $stat{$cpu}{tot} = ($cpu_sys->{cpu_nsec_idle} +
            $cpu_sys->{cpu_nsec_user} +
            $cpu_sys->{cpu_nsec_kernel});
        $stat{$cpu}{crtime} = $cpu_sys->{crtime};
        $stat{$cpu}{ivecs} = {};

        if ($minsnap == -1 || $cpu_sys->{snaptime} < $minsnap) {
            $minsnap = $cpu_sys->{snaptime};
        }
        if ($cpu_sys->{snaptime} > $maxsnap) {
            $maxsnap = $cpu_sys->{snaptime};
        }
        $cpucnt++;
    }

    if ($cpucnt <= 1) {
        $sleeptime = $onecpu_sleeptime;
        return (0);     # nothing to do with 1 CPU
    }

    # Iterate over the ivecs.  If the cpu is not on-line, ignore the
    # ivecs mapped to it, if any.
    #
    # Record pci_intrs:{inum}:<nexus>:time, snaptime, crtime, pil,
    # ino, name, and buspath.  Check $minsnap/$maxsnap.

    foreach my $inst (values(%{$ks->{pci_intrs}})) {
        my $intrcfg = (values(%$inst))[0];
        my $cpu = $intrcfg->{cpu};

        next unless exists $stat{$cpu};
        next if ($intrcfg->{type} =~ /^disabled\0/);

        # Perl looks beyond NULL chars in pattern matching.
        # Truncate name field at the first NULL
        $intrcfg->{name} =~ s/\0.*$//;

        if ($intrcfg->{snaptime} < $minsnap) {
            $minsnap = $intrcfg->{snaptime};
        } elsif ($intrcfg->{snaptime} > $maxsnap) {
            $maxsnap = $intrcfg->{snaptime};
        }

        my $cookie = "$intrcfg->{buspath} $intrcfg->{ino}";
        if (exists $stat{$cpu}{ivecs}{$cookie}) {
            my $cookiestats = $stat{$cpu}{ivecs}{$cookie};

            $cookiestats->{time} += $intrcfg->{time};
            $cookiestats->{name} .= "/$intrcfg->{name}";

            # If this new interrupt sharing $cookie represents a
            # change from an earlier getstat, make sure that
            # generate_delta will see the change by setting
            # crtime to the most recent crtime of its components.

            if ($intrcfg->{crtime} > $cookiestats->{crtime}) {
                $cookiestats->{crtime} = $intrcfg->{crtime};
            }
            $cookiestats->{ihs}++;
            next;
        }

        my %entry = (
            time    => $intrcfg->{time},
            crtime  => $intrcfg->{crtime},
            pil     => $intrcfg->{pil},
            ino     => $intrcfg->{ino},
            num_ino => 1,
            buspath => $intrcfg->{buspath},
            name    => $intrcfg->{name},
            ihs     => 1,
        );
        $stat{$cpu}{ivecs}{$cookie} = \%entry;

        if ($pcplusmp_sys && ($intrcfg->{type} =~ /^msi\0/)) {
            $msidevs{$intrcfg->{name}}{$intrcfg->{ino}} = \%entry;
        }
    }

    # All MSI interrupts of a device instance share a single MSI address.
    # On X86 systems with an APIC, this MSI address is interpreted as CPU
    # routing info by the APIC.  For this reason, on these platforms, all
    # interrupts for MSI devices must be moved to the same CPU at the same
    # time.
    #
    # Since all interrupts will be on the same CPU on these platforms, all
    # interrupts can be consolidated into one ivec entry.  For such devices,
    # num_ino will be > 1 to denote that a group move is needed.

    # Loop thru all MSI devices on X86 pcplusmp systems.
    # Nop on other systems.
    foreach my $msidevkey (sort keys %msidevs) {

        # Loop thru inos of the device, sorted by lowest value first.
        # For each cookie found for a device, incr num_ino for the
        # lowest cookie and remove other cookies.
        #
        # The sort must be numeric: a plain "sort keys" compares
        # strings, which would put ino 10 ahead of ino 9 and make the
        # wrong vector the base of the group.

        # Assumes PIL is the same for first and current cookies

        my $first_cookie;
        foreach my $inokey (sort { $a <=> $b } keys %{$msidevs{$msidevkey}}) {
            my $curr_cookie = $msidevs{$msidevkey}{$inokey};
            if (!defined($first_cookie)) {
                $first_cookie = $curr_cookie;
                next;
            }
            $first_cookie->{num_ino}++;
            $first_cookie->{time} += $curr_cookie->{time};
            if ($curr_cookie->{crtime} > $first_cookie->{crtime}) {
                $first_cookie->{crtime} = $curr_cookie->{crtime};
            }
            # Invalidate this cookie, less complicated and
            # more efficient than deleting it.
            $curr_cookie->{num_ino} = 0;
        }
    }

    # We define the timerange as the amount of time spent gathering the
    # various kstats, divided by our sleeptime.  If we take a lot of time
    # to access the kstats, and then we create a delta comparing these
    # kstats with a prior set of kstats, that delta will cover
    # substantially different amount of time depending upon which
    # interrupt or CPU is being examined.
    #
    # By checking the timerange here, we guarantee that any deltas
    # created from these kstats will contain self-consistent data,
    # in that all CPUs and interrupts cover a similar span of time.
    #
    # $timerange_toohi is the upper bound.  Any timerange above
    # this is thrown out as garbage.  If the stat is safely within this
    # bound, we treat the stat as representing an instant in time, rather
    # than the time range it actually spans.  We arbitrarily choose minsnap
    # as the snaptime of the stat.

    $stat{snaptime} = $minsnap;
    my $timerange = ($maxsnap - $minsnap) / $sleeptime;
    return (0) if ($timerange > $timerange_toohi);      # i.e. failure
    return (\%stat);
}

#
# dumpdelta takes a reference to our "delta" structure:
# {"missing"}           "1" if the delta's component stats had inconsistencies
# {"minsnap"}           time of the first kstat snaptime used in this delta
# {"maxsnap"}           time of the last kstat snaptime used in this delta
# {"goodness"}          cost function applied to this delta
# {"avgintrload"}       avg of interrupt load across cpus, as a percentage
# {"avgintrnsec"}       avg number of nsec spent in interrupts, per cpu
# {<cpuid>}             iterates over on-line cpus
#  ->{"intrs"}          cpu's movable intr time (sum of "time" for each ivec)
#  ->{"tot"}            CPU load from all sources in nsec
#  ->{"bigintr"}        largest value of {ivecs}{<ivec#>}{time} from below
#  ->{"intrload"}       intrs / tot
#  ->{"ivecs"}
#     ->{<ivec#>}       iterates over ivecs for this cpu
#        ->{"time"}     time used by this interrupt (in nsec)
#        ->{"pil"}      pil level of this interrupt
#        ->{"ino"}      interrupt number (or base vector if MSI group)
#        ->{"buspath"}  filename of the directory of the device's bus
#        ->{"name"}     device name
#        ->{"ihs"}      number of different handlers sharing this ino
#        ->{"num_ino"}  number of interrupt vectors in MSI group
#
# It prints out the delta structure in a nice, human readable display.
#

# dumpdelta($delta) writes a human readable rendering of a delta to syslog
# at debug level: first the delta-wide figures, then one section per cpu,
# then one line per interrupt vector assigned to that cpu.  It returns
# nothing of interest.
sub dumpdelta($)
{
    my ($delta) = @_;

    # delta-wide information

    syslog('debug', "dumpdelta:");
    if ($delta->{missing} > 0) {
        syslog('debug', " RECONFIGURATION IN DELTA");
    }
    syslog('debug', " avgintrload: %5.2f%% avgintrnsec: %d",
        $delta->{avgintrload} * 100, $delta->{avgintrnsec});
    if (exists($delta->{goodness})) {
        syslog('debug', " goodness: %5.2f%%", $delta->{goodness} * 100);
    }

    # per-cpu information; only the cpu entries are references, so
    # scalar fields such as "missing" or "minsnap" are skipped

    foreach my $cpuid (keys(%$delta)) {
        my $percpu = $delta->{$cpuid};
        next unless ref($percpu);

        my $total = $percpu->{tot};
        syslog('debug', " cpu %3d intr %7.3f%% (bigintr %7.3f%%)",
            $cpuid, $percpu->{intrload}*100, $percpu->{bigintr}*100/$total);
        syslog('debug', " intrs %d, bigintr %d",
            $percpu->{intrs}, $percpu->{bigintr});

        # per-interrupt information for this cpu; a shared vector is
        # labelled with its handler count, e.g. name(3)

        my $vectors = $percpu->{ivecs};
        foreach my $cookie (keys(%$vectors)) {
            my $vec = $vectors->{$cookie};
            my $label = $vec->{name};
            if ($vec->{ihs} > 1) {
                $label = "$vec->{name}($vec->{ihs})";
            }
            syslog('debug', " %15s:\"%s\": %7.3f%% %d",
                $label, $cookie,
                $vec->{time}*100 / $total, $vec->{time});
        }
    }
}

#
# generate_delta($stat, $newstat) takes two stat references, returned from
# getstat(), and creates a %delta.  %delta (not surprisingly) contains the
# same basic info as stat and newstat, but with the timestamps as deltas
# instead of absolute times.
# We return a reference to the delta.
#
# Whenever the two stats cannot be compared reliably (time not ascending,
# a different set of CPUs or interrupts, a changed crtime, a counter that
# went backwards) the delta is returned early with {"missing"} set to 1;
# such a delta carries no usable load figures.
#

sub generate_delta($$)
{
    my ($stat, $newstat) = @_;

    my %delta = ();

    # Accumulators start at 0 so that a stat without any cpu entries
    # does not trip "uninitialized value" warnings below.
    my $intrload = 0;
    my $intrnsec = 0;
    my $cpus = 0;

    # Take the worstcase timerange
    $delta{minsnap} = $stat->{snaptime};
    $delta{maxsnap} = $newstat->{snaptime};
    if (VERIFY($delta{maxsnap} > $delta{minsnap},
        "generate_delta: stats aren't ascending")) {
        $delta{missing} = 1;
        return (\%delta);
    }

    # if there are a different number of cpus in the stats, set missing

    $delta{missing} = (keys(%$stat) != keys(%$newstat)) ? 1 : 0;
    if (VERIFY($delta{missing} == 0,
        "generate_delta: number of CPUs changed")) {
        return (\%delta);
    }

    # scan through every cpu in %newstat and compare against %stat
    #
    # The loops below iterate over keys() rather than each(): this
    # function returns from inside them, and an abandoned each() would
    # leave the hash's iterator part-way through.

    foreach my $cpu (keys(%$newstat)) {
        my $newcpst = $newstat->{$cpu};
        next if !ref($newcpst);         # skip non-cpuid fields

        # If %stat is missing a cpu from %newstat, then it was just
        # onlined.  Mark missing.

        if (VERIFY(exists $stat->{$cpu} &&
            $stat->{$cpu}{crtime} == $newcpst->{crtime},
            "generate_delta: cpu $cpu changed")) {
            $delta{missing} = 1;
            return (\%delta);
        }
        my $cpst = $stat->{$cpu};
        $delta{$cpu}{tot} = $newcpst->{tot} - $cpst->{tot};
        if (VERIFY($delta{$cpu}{tot} >= 0,
            "generate_delta: deltas are not ascending?")) {
            $delta{missing} = 1;
            delete($delta{$cpu});
            return (\%delta);
        }
        # Avoid remote chance of division by zero
        $delta{$cpu}{tot} = 1 if $delta{$cpu}{tot} == 0;
        $delta{$cpu}{intrs} = 0;
        $delta{$cpu}{bigintr} = 0;

        my %ivecs = ();
        $delta{$cpu}{ivecs} = \%ivecs;

        # if the number of ivecs differs, set missing

        if (VERIFY(keys(%{$cpst->{ivecs}}) ==
            keys(%{$newcpst->{ivecs}}),
            "generate_delta: cpu $cpu has more/less interrupts")) {
            $delta{missing} = 1;
            return (\%delta);
        }

        foreach my $inum (keys(%{$newcpst->{ivecs}})) {
            my $newivec = $newcpst->{ivecs}{$inum};

            # Unused cookie, corresponding to an MSI vector which
            # is part of a group.  The whole group is accounted for
            # by a different cookie.
            next if ($newivec->{num_ino} == 0);

            # If this ivec doesn't exist in $stat, or if $stat
            # shows a different crtime, set missing.
            if (VERIFY(exists $cpst->{ivecs}{$inum} &&
                $cpst->{ivecs}{$inum}{crtime} ==
                $newivec->{crtime},
                "generate_delta: cpu $cpu inum $inum has changed")) {
                $delta{missing} = 1;
                return (\%delta);
            }
            my $ivec = $cpst->{ivecs}{$inum};

            # Create $delta{$cpu}{ivecs}{$inum}.

            my %dltivec = ();
            $delta{$cpu}{ivecs}{$inum} = \%dltivec;

            # calculate time used by this interrupt

            my $time = $newivec->{time} - $ivec->{time};
            if (VERIFY($time >= 0,
                "generate_delta: ivec went backwards?")) {
                $delta{missing} = 1;
                delete($delta{$cpu}{ivecs}{$inum});
                return (\%delta);
            }
            $delta{$cpu}{intrs} += $time;
            $dltivec{time} = $time;
            if ($time > $delta{$cpu}{bigintr}) {
                $delta{$cpu}{bigintr} = $time;
            }

            # Transfer over basic info about the kstat.  We
            # don't have to worry about discrepancies between
            # ivec and newivec because we verified that both
            # have the same crtime.

            $dltivec{pil} = $newivec->{pil};
            $dltivec{ino} = $newivec->{ino};
            $dltivec{buspath} = $newivec->{buspath};
            $dltivec{name} = $newivec->{name};
            $dltivec{ihs} = $newivec->{ihs};
            $dltivec{num_ino} = $newivec->{num_ino};
        }
        if ($delta{$cpu}{tot} < $delta{$cpu}{intrs}) {
            # Ewww!  Hopefully just a rounding error.
            # Make something up.
            $delta{$cpu}{tot} = $delta{$cpu}{intrs};
        }
        $delta{$cpu}{intrload} =
            $delta{$cpu}{intrs} / $delta{$cpu}{tot};
        $intrload += $delta{$cpu}{intrload};
        $intrnsec += $delta{$cpu}{intrs};
        $cpus++;
    }
    if ($cpus > 0) {
        $delta{avgintrload} = $intrload / $cpus;
        $delta{avgintrnsec} = $intrnsec / $cpus;
    } else {
        $delta{avgintrload} = 0;
        $delta{avgintrnsec} = 0;
    }
    return (\%delta);
}


# compress_delta takes a list of deltas, and returns a single new delta
# which represents the combined information from all the deltas.  The deltas
# provided are assumed to be sequential in time.  The resulting compressed
# delta looks just like any other delta.  This new delta is also more accurate
# since its statistics are averaged over a longer period than any of the
# original deltas.

# compress_deltas($deltas) takes a reference to a list of deltas and returns
# a reference to one combined delta, or the scalar 0 if the list is empty
# or contains a delta marked "missing".
#
# As a side effect it sets $sleeptime: the idle interval when no cpu's
# combined interrupt load reaches $idle_intrload, the normal one otherwise.
sub compress_deltas ($)
{
    my ($deltas) = @_;

    my %newdelta = ();
    my $intrs = 0;              # interrupt nsec summed over all cpus
    my $tot = 0;                # total nsec summed over all cpus
    my $cpus = 0;
    my $high_intrload = 0;

    if (VERIFY($#$deltas != -1,
        "compress_deltas: list of delta is empty?")) {
        return (0);
    }
    $newdelta{minsnap} = $deltas->[0]{minsnap};
    $newdelta{maxsnap} = $deltas->[$#$deltas]{maxsnap};
    $newdelta{missing} = 0;

    # Sum the per-cpu and per-interrupt times over all deltas.  The
    # descriptive ivec fields are simply taken from the latest delta.

    foreach my $delta (@$deltas) {
        if (VERIFY($delta->{missing} == 0,
            "compressing bad deltas?")) {
            return (0);
        }
        foreach my $cpuid (keys(%$delta)) {
            my $cpu = $delta->{$cpuid};
            next if !ref($cpu);

            $intrs += $cpu->{intrs};
            $tot += $cpu->{tot};
            $newdelta{$cpuid}{intrs} += $cpu->{intrs};
            $newdelta{$cpuid}{tot} += $cpu->{tot};
            if (!exists $newdelta{$cpuid}{ivecs}) {
                $newdelta{$cpuid}{ivecs} = {};
            }
            my $newivecs = $newdelta{$cpuid}{ivecs};
            foreach my $inum (keys(%{$cpu->{ivecs}})) {
                my $ivec = $cpu->{ivecs}{$inum};
                $newivecs->{$inum}{time} += $ivec->{time};
                $newivecs->{$inum}{pil} = $ivec->{pil};
                $newivecs->{$inum}{ino} = $ivec->{ino};
                $newivecs->{$inum}{buspath} = $ivec->{buspath};
                $newivecs->{$inum}{name} = $ivec->{name};
                $newivecs->{$inum}{ihs} = $ivec->{ihs};
                $newivecs->{$inum}{num_ino} = $ivec->{num_ino};
            }
        }
    }

    # Derive bigintr and intrload for every cpu of the combined delta.

    foreach my $cpu (values(%newdelta)) {
        next if !ref($cpu);             # ignore non-cpu fields
        $cpus++;

        my $bigintr = 0;
        foreach my $ivec (values(%{$cpu->{ivecs}})) {
            if ($ivec->{time} > $bigintr) {
                $bigintr = $ivec->{time};
            }
        }
        $cpu->{bigintr} = $bigintr;

        # Clamp tot BEFORE dividing by it; the guard is useless if it
        # only runs after the division it is meant to protect.
        $cpu->{tot} = 1 if $cpu->{tot} <= 0;
        $cpu->{intrload} = $cpu->{intrs} / $cpu->{tot};
        if ($high_intrload < $cpu->{intrload}) {
            $high_intrload = $cpu->{intrload};
        }
    }
    if ($cpus == 0) {
        $newdelta{avgintrnsec} = 0;
        $newdelta{avgintrload} = 0;
    } else {
        $newdelta{avgintrnsec} = $intrs / $cpus;
        $newdelta{avgintrload} = ($tot > 0) ? $intrs / $tot : 0;
    }
    $sleeptime = ($high_intrload < $idle_intrload) ? $idle_sleeptime :
        $normal_sleeptime;
    return (\%newdelta);
}





# What follow are the core functions responsible for examining the deltas
# generated above and deciding what to do about them.
#
# goodness() and its helper goodness_cpu() return a heuristic which describes
# how good (or bad) the current interrupt balance is.  The value returned will
# be between 0 and 1, with 0 representing maximum goodness, and 1 representing
# maximum badness.
#
# imbalanced() compares a current and historical value of goodness, and
# determines if there has been enough change to warrant evaluating a
# reconfiguration of the interrupts
#
# do_reconfig(), and its helpers, do_reconfig_cpu(), do_reconfig_cpu2cpu(),
# find_goal(), do_find_goal(), and move_intr(), are responsible for examining
# a delta and determining the best possible assignment of interrupts to CPUs.
#
# It is important that do_reconfig() be in alignment with goodness().  If
# do_reconfig were to generate a new interrupt distribution that worsened
# goodness, we could get into a pathological loop with intrd fighting itself,
# constantly deciding that things are imbalanced, and then changing things
# only to make them worse.



# any goodness over $goodness_unsafe_load is considered really bad
# goodness must drop by at least $goodness_mindelta for a reconfig

my $goodness_unsafe_load = .9;
my $goodness_mindelta = .1;

# goodness(%delta) examines a delta and returns its "goodness".  goodness will
# be between 0 (best) and 1 (major bad).  goodness is determined by evaluating
# the goodness of each individual cpu, and returning the worst case.  This
# helps on systems with many CPUs, where otherwise a single pathological CPU
# might otherwise be ignored because the average was OK.
#
# To calculate the goodness of an individual CPU, we start by looking at its
# load due to interrupts.  If the load is above a certain high threshold and
# there is more than one interrupt assigned to this CPU, we set goodness
# to worst-case.  If the load is below the average interrupt load of all CPUs,
# then we return best-case, since what's to complain about?
#
# Otherwise we look at how much the load is above the average, and return
# that as the goodness, with one caveat: we never return more than the CPU's
# interrupt load ignoring its largest single interrupt source.  This is
# because a CPU with one high-load interrupt, and no other interrupts, is
# perfectly balanced.  Nothing can be done to improve the situation, and thus
# it is perfectly balanced even if the interrupt's load is 100%.

sub goodness($)
{
    my ($delta) = @_;

    # a delta with inconsistent stats is always worst-case
    if ($delta->{missing} > 0) {
        return (1);
    }

    my $worst = 0;

    # only the per-cpu entries of a delta are references
    foreach my $percpu (grep { ref($_) } values(%$delta)) {
        my $score = goodness_cpu($percpu, $delta->{avgintrload});

        if (VERIFY($score >= 0 && $score <= 1,
            "goodness: cpu goodness out of range?")) {
            dumpdelta($delta);
            return (1);
        }

        # worst case, no need to continue
        return (1) if ($score == 1);

        $worst = $score if ($score > $worst);
    }
    return ($worst);
}

# goodness_cpu($cpu, $avgintrload) scores a single cpu entry of a delta
# against the average interrupt load, returning a value from 0 (best)
# to 1 (worst).
sub goodness_cpu($$)            # private function
{
    my ($cpu, $avgintrload) = @_;

    my ($intrs, $tot) = @{$cpu}{qw(intrs tot)};
    my $load = $intrs / $tot;

    # low loads are perfectly good
    if ($load < $avgintrload) {
        return (0);
    }

    # $load_no_bigintr represents the load due to interrupts,
    # excluding the one biggest interrupt.  This is the most gain
    # we can get on this CPU from offloading interrupts.

    my $load_no_bigintr = ($intrs - $cpu->{bigintr}) / $tot;

    # A major imbalance is indicated if a CPU is saturated
    # with interrupt handling, and it has more than one
    # source of interrupts.  Those other interrupts could be
    # starved if of a lower pil.  Return a goodness of 1,
    # which is the worst possible return value,
    # which will effectively contaminate this entire delta.

    my $nivecs = keys(%{$cpu->{ivecs}});
    if ($nivecs > 1 && $load > $goodness_unsafe_load) {
        return (1);
    }

    # otherwise: the excess over the average, capped at what could
    # actually be gained by moving interrupts away
    my $excess = $load - $avgintrload;
    return (($excess > $load_no_bigintr) ? $load_no_bigintr : $excess);
}


# imbalanced() is used by the main routine to determine if the goodness
# has shifted far enough from our last baseline to warrant a reassignment
# of interrupts.  A very high goodness indicates that a CPU is way out of
# whack.
# If the goodness has varied too much since the baseline, then
# perhaps a reconfiguration is worth considering.

sub imbalanced ($$)
{
    my ($goodness, $baseline) = @_;

    # Imbalanced (1) if we are pathological, or creeping away from the
    # baseline by more than $goodness_mindelta; otherwise balanced (0).

    if ($goodness > .50 ||
        abs($goodness - $baseline) > $goodness_mindelta) {
        return (1);
    }
    return (0);
}

# do_reconfig(), do_reconfig_cpu(), and do_reconfig_cpu2cpu(), are the
# decision-making functions responsible for generating a new interrupt
# distribution.  They are designed with the definition of goodness() in
# mind, i.e. they use the same definition of "good distribution" as does
# goodness().
#
# do_reconfig() is responsible for deciding whether a redistribution is
# actually warranted.  If the goodness is already pretty good, it doesn't
# waste the CPU time to generate a new distribution.  If it
# calculates a new distribution and finds that it is not sufficiently
# improved from the prior distribution, it will not do the redistribution,
# mainly to avoid the disruption to system performance caused by
# rejuggling interrupts.
#
# Its main loop works by going through a list of cpus sorted from
# highest to lowest interrupt load.  It removes the highest-load cpus
# one at a time and hands them off to do_reconfig_cpu().
# This function then re-sorts the remaining CPUs from lowest to highest
# interrupt load, and one at a time attempts to rejuggle interrupts between
# the original high-load CPU and the low-load CPU.  Rejuggling on a
# high-load CPU is considered finished as soon as its interrupt load is
# within $goodness_mindelta of the average interrupt load.  Such a CPU will
# have a goodness of below the $goodness_mindelta threshold.

#
# move_intr(\%delta, $inum, $oldcpu, $newcpu)
#    used by reconfiguration code to move an interrupt between cpus within
#    a delta.  This manipulates data structures, and does not actually move
#    the interrupt on the running system.
#
#    Updates {intrs}, {intrload}, {ivecs} and {bigintr} on both cpu
#    entries, and sets {nowcpu} on the moved ivec.
#
sub move_intr($$$$)    # private function
{
    my ($delta, $inum, $oldcpuid, $newcpuid) = @_;

    my $ivec = $delta->{$oldcpuid}{ivecs}{$inum};

    # Remove ivec from old cpu

    my $oldcpu = $delta->{$oldcpuid};
    $oldcpu->{intrs} -= $ivec->{time};
    $oldcpu->{intrload} = $oldcpu->{intrs} / $oldcpu->{tot};
    delete($oldcpu->{ivecs}{$inum});

    VERIFY($oldcpu->{intrs} >= 0, "move_intr: intr's time > total time?");
    VERIFY($ivec->{time} <= $oldcpu->{bigintr},
        "move_intr: intr's time > bigintr?");

    # If the interrupt we just removed was the biggest one on the old
    # cpu, rescan what is left there to find the new biggest.

    if ($ivec->{time} >= $oldcpu->{bigintr}) {
        my $bigtime = 0;

        foreach my $other (values(%{$oldcpu->{ivecs}})) {
            $bigtime = $other->{time} if $other->{time} > $bigtime;
        }
        $oldcpu->{bigintr} = $bigtime;
    }

    # Add ivec onto new cpu

    my $newcpu = $delta->{$newcpuid};

    $ivec->{nowcpu} = $newcpuid;
    $newcpu->{intrs} += $ivec->{time};
    $newcpu->{intrload} = $newcpu->{intrs} / $newcpu->{tot};
    $newcpu->{ivecs}{$inum} = $ivec;

    $newcpu->{bigintr} = $ivec->{time}
        if $ivec->{time} > $newcpu->{bigintr};
}

#
# move_intr_check(\%delta, $oldcpuid, $newcpuid)
#    Sanity-checks (via VERIFY) that neither cpu has ended up with more
#    interrupt time than total time after interrupts were moved.
#
sub move_intr_check($$$)    # private function
{
    my ($delta, $oldcpuid, $newcpuid) = @_;

    VERIFY($delta->{$oldcpuid}{tot} >= $delta->{$oldcpuid}{intrs},
        "Moved interrupts left 100+%% load on src cpu");
    VERIFY($delta->{$newcpuid}{tot} >= $delta->{$newcpuid}{intrs},
        "Moved interrupts left 100+%% load on tgt cpu");
}

#
# ivecs_to_string(@ivecs)
#    Returns the {inum} of every ivec passed in, each preceded by a
#    single space, as one string (used for debug logging).
#
sub ivecs_to_string(@)    # private function
{
    return (join("", map(" $_->{inum}", @_)));
}


#
# do_reconfig(\%delta)
#    Works out a better interrupt distribution for the given delta and,
#    if the improvement is worthwhile, applies it with intrmove().
#    $delta->{goodness} must already be set by the caller.
#
#    Returns  0 if no reconfiguration was done (already good enough),
#             1 if interrupts were moved,
#            -1 on failure (delta has {missing}, or an intrmove() failed).
#
sub do_reconfig($)
{
    my ($delta) = @_;

    my $goodness = $delta->{goodness};

    # We can't improve goodness to better than 0.  We should stop here
    # if, even if we achieve a goodness of 0, the improvement is still
    # too small to merit the action.

    if ($goodness - 0 < $goodness_mindelta) {
        syslog('debug', "goodness good enough, don't reconfig");
        return (0);
    }

    syslog('notice', "Optimizing interrupt assignments");

    if (VERIFY ($delta->{missing} == 0, "RECONFIG Aborted: should not ".
        "have a delta with missing")) {
        return (-1);
    }

    # Make a list of all cpuids, and also add some extra information
    # to the ivec structures.
    #
    # keys() is used rather than each(): each() resumes from wherever
    # the hash's shared iterator was last left, whereas keys() always
    # walks the whole hash.

    my @cpusortlist = ();

    foreach my $cpuid (keys(%$delta)) {
        my $cpu = $delta->{$cpuid};
        next if !ref($cpu);    # skip non-cpu entries

        push(@cpusortlist, $cpuid);
        foreach my $inum (keys(%{$cpu->{ivecs}})) {
            my $ivec = $cpu->{ivecs}{$inum};

            $ivec->{origcpu} = $cpuid;
            $ivec->{nowcpu} = $cpuid;
            $ivec->{inum} = $inum;
        }
    }

    # Sort the list of CPUs from highest to lowest interrupt load.
    # Remove the top CPU from that list and attempt to redistribute
    # its interrupts.  If the CPU has a goodness below a threshold,
    # just ignore the CPU and move to the next one.  If the CPU's
    # load falls below the average load plus that same threshold,
    # then there are no CPUs left worth reconfiguring, and we're done.

    while (@cpusortlist) {
        # Re-sort cpusortlist each time, since do_reconfig_cpu can
        # move interrupts around.

        @cpusortlist =
            sort({$delta->{$b}{intrload} <=> $delta->{$a}{intrload}}
            @cpusortlist);

        my $cpu = shift(@cpusortlist);
        if (($delta->{$cpu}{intrload} <= $goodness_unsafe_load) &&
            ($delta->{$cpu}{intrload} <=
            $delta->{avgintrload} + $goodness_mindelta)) {
            syslog('debug', "finished reconfig: cpu $cpu load ".
                "$delta->{$cpu}{intrload} avgload ".
                "$delta->{avgintrload}");
            last;
        }
        if (goodness_cpu($delta->{$cpu}, $delta->{avgintrload}) <
            $goodness_mindelta) {
            next;
        }
        do_reconfig_cpu($delta, \@cpusortlist, $cpu);
    }

    # How good a job did we do?  If the improvement was minimal, and
    # our goodness wasn't pathological (and thus needing any help it
    # can get), then don't bother moving the interrupts.

    my $newgoodness = goodness($delta);
    VERIFY($newgoodness <= $goodness,
        "reconfig: result has worse goodness?");

    if (($goodness != 1 || $newgoodness == 1) &&
        $goodness - $newgoodness < $goodness_mindelta) {
        syslog('debug', "goodness already near optimum, ".
            "don't reconfig");
        return (0);
    }
    syslog('debug', "goodness %5.2f%% --> %5.2f%%", $goodness*100,
        $newgoodness*100);

    # Time to move those interrupts!  Every ivec that now sits on a cpu
    # other than the one it started on is moved on the running system.

    my $ret = 1;
    my $warned = 0;
    foreach my $cpuid (keys(%$delta)) {
        my $cpu = $delta->{$cpuid};
        next if !ref($cpu);    # skip non-cpu entries

        foreach my $ivec (values(%{$cpu->{ivecs}})) {
            next if ($ivec->{origcpu} == $cpuid);

            if (!intrmove($ivec->{buspath}, $ivec->{origcpu},
                $ivec->{ino}, $cpuid, $ivec->{num_ino})) {
                # Warn only once, but log every failure
                # at debug level.
                syslog('warning', "Unable to move interrupts")
                    if $warned++ == 0;
                syslog('debug', "Unable to move buspath ".
                    "$ivec->{buspath} ino $ivec->{ino} to ".
                    "cpu $cpuid");
                $ret = -1;
            }
        }
    }

    syslog('notice', "Interrupt assignments optimized");
    return ($ret);
}

#
# do_reconfig_cpu(\%delta, \@cpusortlist, $oldcpuid)
#    Tries to improve the load on $oldcpuid by exchanging interrupts
#    with the cpus on @cpusortlist, least-loaded first.  The caller's
#    list is not modified.
#
sub do_reconfig_cpu($$$)    # private function
{
    my ($delta, $cpusortlist, $oldcpuid) = @_;

    # We have been asked to rejuggle interrupts between $oldcpuid and
    # other CPUs found on $cpusortlist so as to improve the load on
    # $oldcpuid.  We reverse $cpusortlist to get our own copy of the
    # list, sorted from lowest to highest interrupt load.  One at a
    # time, shift a CPU off of this list of CPUs, and attempt to
    # rejuggle interrupts between the two CPUs.  Don't do this if the
    # other CPU has a higher load than oldcpuid.  We're done rejuggling
    # once $oldcpuid's goodness falls below a threshold.

    syslog('debug', "reconfiguring $oldcpuid");

    my $cpu = $delta->{$oldcpuid};
    my $avgintrload = $delta->{avgintrload};

    my @cputargetlist = reverse(@$cpusortlist); # make a copy of the list
    while (@cputargetlist) {
        last if goodness_cpu($cpu, $avgintrload) < $goodness_mindelta;

        my $tgtcpuid = shift(@cputargetlist);
        my $tgt = $delta->{$tgtcpuid};
        my $load = $cpu->{intrload};
        my $tgtload = $tgt->{intrload};
        last if $tgtload > $load;
        do_reconfig_cpu2cpu($delta, $oldcpuid, $tgtcpuid, $load);
    }
}

#
# do_reconfig_cpu2cpu(\%delta, $srccpuid, $tgtcpuid, $srcload)
#    Redistributes the interrupts of the two given cpus between them
#    (within the delta only), aiming to bring $srccpuid down towards the
#    goal load.  $srcload is the source cpu's load before the exchange
#    and is only used for the final sanity check.
#
sub do_reconfig_cpu2cpu($$$$)    # private function
{
    my ($delta, $srccpuid, $tgtcpuid, $srcload) = @_;

    # We've been asked to consider interrupt juggling between srccpuid
    # (with a high interrupt load) and tgtcpuid (with a lower interrupt
    # load).  First, make a single list with all of the ivecs from both
    # CPUs, and sort the list from highest to lowest load.

    syslog('debug', "exchanging intrs between $srccpuid and $tgtcpuid");

    # Gather together all the ivecs and sort by load

    my @ivecs = (values(%{$delta->{$srccpuid}{ivecs}}),
        values(%{$delta->{$tgtcpuid}{ivecs}}));
    return if !@ivecs;

    @ivecs = sort({$b->{time} <=> $a->{time}} @ivecs);

    # Our "goal" load for srccpuid is the average load across all CPUs.
    # find_goal() will determine the optimum selection of the
    # available interrupts which comes closest to this goal without
    # falling below the goal.

    my $goal = $delta->{avgintrnsec};

    # We know that the interrupt load on tgtcpuid is less than that on
    # srccpuid, but its load could still be above avgintrnsec.  Don't
    # choose a goal which would bring srccpuid below the load on tgtcpuid.

    my $avgnsec =
        ($delta->{$srccpuid}{intrs} + $delta->{$tgtcpuid}{intrs}) / 2;
    if ($goal < $avgnsec) {
        $goal = $avgnsec;
    }

    # If the largest of the interrupts is on srccpuid, leave it there.
    # This can help minimize the disruption caused by moving interrupts.

    if ($ivecs[0]->{origcpu} == $srccpuid) {
        syslog('debug', "Keeping $ivecs[0]->{inum} on $srccpuid");
        $goal -= $ivecs[0]->{time};
        shift(@ivecs);
    }

    syslog('debug', "GOAL: inums should total $goal");
    find_goal(\@ivecs, $goal);

    # find_goal() returned its results to us by setting $ivec->{goal} if
    # the ivec should be on srccpuid, or clearing it for tgtcpuid.
    # Call move_intr() to update our $delta with the new results.

    foreach my $ivec (@ivecs) {
        syslog('debug', "ivec $ivec->{inum} goal $ivec->{goal}");
        VERIFY($ivec->{nowcpu} == $srccpuid ||
            $ivec->{nowcpu} == $tgtcpuid, "cpu2cpu found an ".
            "interrupt not currently on src or tgt cpu");

        if ($ivec->{goal} && $ivec->{nowcpu} != $srccpuid) {
            move_intr($delta, $ivec->{inum}, $ivec->{nowcpu},
                $srccpuid);
        } elsif ($ivec->{goal} == 0 && $ivec->{nowcpu} != $tgtcpuid) {
            move_intr($delta, $ivec->{inum}, $ivec->{nowcpu},
                $tgtcpuid);
        }
    }
    move_intr_check($delta, $srccpuid, $tgtcpuid);    # asserts

    my $newload = $delta->{$srccpuid}{intrs} / $delta->{$srccpuid}{tot};
    VERIFY($newload <= $srcload && $newload > $delta->{avgintrload},
        "cpu2cpu: new load didn't end up in expected range");
}


# find_goal() and its helper do_find_goal() are used to find the best
# combination of interrupts in order to generate a load that is as close
# as possible to a goal load without falling below that goal.  Before
# returning to its caller, find_goal() sets a new value in the hash of each
# interrupt, {goal}, which if set signifies that this interrupt is one of
# the interrupts identified as part of the set of interrupts which best
# meet the goal.
#
# The arguments to find_goal are a list of ivecs (hash references), sorted
# by descending {time}, and the goal load.  The goal is relative to {time}.
# The best fit is determined by performing a depth-first search.
# do_find_goal is the recursive subroutine which carries out the search.
#
# It is passed an index as an argument, originally 0.
# On a given invocation,
# it is only to consider interrupts in the ivecs array starting at that index.
# It then considers two possibilities:
#   1) What is the best goal-fit if I include ivecs[index]?
#   2) What is the best goal-fit if I exclude ivecs[index]?
# To determine case 1, it subtracts the load of ivecs[index] from the goal,
# and calls itself recursively with that new goal and index++.
# To determine case 2, it calls itself recursively with the same goal and
# index++.
#
# It then compares the two results, decides which one best meets the goals,
# and returns the result.  The return value is the best-fit's interrupt load,
# followed by a list of all the interrupts which make up that best-fit.
#
# As an optimization, a second array loads[] is created which mirrors ivecs[].
# loads[i] will equal the total loads of all ivecs[i..$#ivecs].  This is used
# by do_find_goal to avoid recursing all the way to the end of the ivecs
# array if including all remaining interrupts will still leave the best-fit
# at below goal load.  If so, it then includes all remaining interrupts on
# the goal list and returns.
#
sub find_goal($$)    # private function
{
    my ($ivecs, $goal) = @_;

    # Marks each ivec in @$ivecs with {goal} = 1 (part of the chosen
    # best-fit set) or {goal} = 0 (not part of it).

    my @goals = ();
    my $load;

    if ($goal > 0) {
        syslog('debug', "finding goal from intrs %s",
            ivecs_to_string(@$ivecs));

        # Generate @loads array: $loads[i] is the summed {time} of
        # ivecs i through the end of the list.

        my $remaining = 0;
        $remaining += $_->{time} for @$ivecs;

        my @loads = ();
        for my $vec (@$ivecs) {
            push(@loads, $remaining);
            $remaining -= $vec->{time};
        }

        ($load, @goals) = do_find_goal($ivecs, \@loads, $goal, 0);
        VERIFY($load >= $goal, "find_goal didn't meet goals");
    }
    # else: the empty set will best meet the goal

    syslog('debug', "goals found: %s", ivecs_to_string(@goals));

    # Set or clear $ivec->{goal} for each ivec, based on returned @goals.
    # @goals is consumed from the front as its members are matched
    # against @$ivecs.

    foreach my $ivec (@$ivecs) {
        if (@goals && $ivec == $goals[0]) {
            syslog('debug', "inum $ivec->{inum} on source cpu");
            $ivec->{goal} = 1;
            shift(@goals);
        } else {
            syslog('debug', "inum $ivec->{inum} on target cpu");
            $ivec->{goal} = 0;
        }
    }
}


sub do_find_goal($$$$)    # private function
{
    my ($ivecs, $loads, $goal, $idx) = @_;

    # Recursive search step.  Returns a list: the load of the best fit
    # found using only ivecs[$idx ..], followed by the ivecs making up
    # that fit.

    # Past the end of the list: nothing left to include.
    return (0) if ($idx > $#{$ivecs});

    syslog('debug', "$idx: finding goal $goal inum $ivecs->[$idx]{inum}");

    my $load = $ivecs->[$idx]{time};

    # If we include all remaining items and we're still below goal,
    # stop here.  We can just return a result that includes $idx and all
    # subsequent ivecs.  Since this will still be below goal, there's
    # nothing better to be done.

    if ($loads->[$idx] <= $goal) {
        syslog('debug',
            "$idx: including all remaining intrs %s with load %d",
            ivecs_to_string(@$ivecs[$idx .. $#{$ivecs}]),
            $loads->[$idx]);
        return ($loads->[$idx], @$ivecs[$idx .. $#{$ivecs}]);
    }

    # Evaluate the "with" option, i.e. the best matching goal which
    # includes $ivecs->[$idx].  If idx's load is more than our goal load,
    # stop here.  Once we're above the goal, there is no need to consider
    # further interrupts since they'll only take us further from the goal.

    my $with;
    my @goals_with = ();
    if ($goal <= $load) {
        $with = $load;    # stop here
    } else {
        ($with, @goals_with) =
            do_find_goal($ivecs, $loads, $goal - $load, $idx + 1);
        $with += $load;
    }
    syslog('debug', "$idx: with-load $with intrs %s",
        ivecs_to_string($ivecs->[$idx], @goals_with));

    # Evaluate the "without" option, i.e. the best matching goal which
    # excludes $ivecs->[$idx].

    my ($without, @goals_without) =
        &do_find_goal($ivecs, $loads, $goal, $idx + 1);
    syslog('debug', "$idx: without-load $without intrs %s",
        ivecs_to_string(@goals_without));

    # We now have our "with" and "without" options, and we choose which
    # best fits the goal.  If one is greater than goal and the other is
    # below goal, we choose the one that is greater.  If they are both
    # below goal, then we choose the one that is greater.  If they are
    # both above goal, then we choose the smaller.

    my $which;    # 0 == with, 1 == without
    if ($with >= $goal && $without < $goal) {
        $which = 0;
    } elsif ($with < $goal && $without >= $goal) {
        $which = 1;
    } elsif ($with >= $goal && $without >= $goal) {
        $which = ($without < $with);
    } else {
        $which = ($without > $with);
    }

    # Return the load of our best case scenario, followed by all the ivecs
    # which compose that goal.

    if ($which) {    # without
        syslog('debug', "$idx: going without");
        return ($without, @goals_without);
    }

    syslog('debug', "$idx: going with");
    return ($with, $ivecs->[$idx], @goals_with);
}




syslog('debug', "intrd is starting".($debug ? " (debug)" : ""));

my @deltas = ();
my $deltas_tottime = 0;    # sum of maxsnap-minsnap across @deltas
my $avggoodness;
my $baseline_goodness = 0;
my $compdelta;

my $do_reconfig;

# temp variables
my $goodness;
my $deltatime;
my $olddelta;
my $olddeltatime;
my $delta;
my $newstat;
my $below_statslen;
my $newtime;
my $ret;


# Signals only set a flag, which is polled at safe points in the main loop
# so that we don't die in the middle of retargeting.
my $gotsig = 0;
my $note_signal = sub { $gotsig = 1; };
$SIG{$_} = $note_signal foreach qw(INT HUP TERM);

# The kstat source is either the real thing, or the simulator's stand-in
# (myks_update() is supplied by the simulator).
my $ks = ($using_scengen == 0)
    ? Sun::Solaris::Kstat->new()
    : myks_update();

# If no pci_intrs kstats were found, we need to exit, but we can't because
# SMF will restart us and/or report an error to the administrator.  But
# there's nothing an administrator can do.  So print out a message for SMF
# logs and silently pause forever.

if (!exists($ks->{pci_intrs})) {
    print STDERR "$cmdname: no interrupts were found; ".
        "your PCI bus may not yet be supported\n";
    pause() while $gotsig == 0;
    exit 0;
}

# See if this is a system with a pcplusmp APIC.
# Such systems will get special handling.
# Assume that if one bus has a pcplusmp APIC that they all do.

# Get a list of pci_intrs kstats.
my @elem = values(%{$ks->{pci_intrs}});
my $elem0 = $elem[0];
my $elemval = (values(%$elem0))[0];

# Use its buspath to query the system.  It is assumed that either all or none
# of the busses on a system are hosted by the pcplusmp APIC or APIX.
my $pcplusmp_sys = is_apic($elemval->{buspath});

my $stat = getstat($ks, $pcplusmp_sys);

# Throw away all accumulated statistics and start from scratch.
sub clear_deltas {
    @deltas = ();
    $deltas_tottime = 0;
    $stat = 0;    # prevent next gen_delta() from setting {missing}
}

for (;;) {
    # 1. Sleep, update the kstats, and save the new stats in $newstat.

    exit 0 if $gotsig;    # if we got ^C / SIGTERM, exit
    if ($using_scengen == 0) {
        sleep($sleeptime);
        exit 0 if $gotsig;    # if we got ^C / SIGTERM, exit
        $ks->update();
    } else {
        $ks = myks_update();
    }
    $newstat = getstat($ks, $pcplusmp_sys);

    # $stat or $newstat could be zero if they're uninitialized, or if
    # getstat() failed.  If $stat is zero, move $newstat to $stat, sleep
    # and try again.  If $newstat is zero, then we also sleep and try
    # again, hoping the problem will clear up.

    next if (!ref $newstat);
    if (!ref $stat) {
        $stat = $newstat;
        next;
    }

    # 2. Compare $newstat with the prior set of values, result in %$delta.

    $delta = generate_delta($stat, $newstat);
    dumpdelta($delta) if $debug;    # Dump most recent stats to stdout.
    $stat = $newstat;    # The new stats now become the old stats.


    # 3. If $delta->{missing}, then there has been a reconfiguration of
    # either cpus or interrupts (probably both).  We need to toss out our
    # old set of statistics and start from scratch.
    #
    # Also, if the delta covers a very long range of time, then we've
    # been experiencing a system overload that has resulted in intrd
    # not being allowed to run effectively for a while now.  As above,
    # toss our old statistics and start from scratch.

    $deltatime = $delta->{maxsnap} - $delta->{minsnap};
    if ($delta->{missing} > 0 || $deltatime > $statslen) {
        clear_deltas();
        syslog('debug', "evaluating interrupt assignments");
        next;
    }


    # 4. Incorporate new delta into the list of deltas, and associated
    # statistics.  If we've just now received $statslen deltas, then it's
    # time to evaluate a reconfiguration.

    $below_statslen = ($deltas_tottime < $statslen);
    $deltas_tottime += $deltatime;
    $do_reconfig = ($below_statslen && $deltas_tottime >= $statslen);
    push(@deltas, $delta);

    # 5. Remove old deltas if total time is more than $statslen.  We use
    # @deltas as a moving average of the last $statslen seconds.  Shift
    # off the oldest deltas, but only if that doesn't cause us to fall
    # below $statslen seconds.

    while (@deltas > 1) {
        $olddelta = $deltas[0];
        $olddeltatime = $olddelta->{maxsnap} - $olddelta->{minsnap};
        $newtime = $deltas_tottime - $olddeltatime;
        last if ($newtime < $statslen);

        shift(@deltas);
        $deltas_tottime = $newtime;
    }

    # 6. The brains of the operation are here.  First, check if we're
    # imbalanced, and if so set $do_reconfig.  If $do_reconfig is set,
    # either because of imbalance or above in step 4, we evaluate a
    # new configuration.
    #
    # First, take @deltas and generate a single "compressed" delta
    # which summarizes them all.  Pass that to do_reconfig and see
    # what it does with it:
    #
    # $ret == -1 : failure
    # $ret ==  0 : current config is optimal (or close enough)
    # $ret ==  1 : reconfiguration has occurred
    #
    # If $ret is -1 or 1, dump all our deltas and start from scratch.
    # Step 4 above will set do_reconfig soon thereafter.
    #
    # If $ret is 0, then nothing has happened because we're already
    # good enough.  Set baseline_goodness to current goodness.

    $compdelta = compress_deltas(\@deltas);
    if (VERIFY(ref($compdelta) eq "HASH", "couldn't compress deltas")) {
        clear_deltas();
        next;
    }
    $compdelta->{goodness} = goodness($compdelta);
    dumpdelta($compdelta) if $debug;

    $goodness = $compdelta->{goodness};
    syslog('debug', "GOODNESS: %5.2f%%", $goodness * 100);

    if ($deltas_tottime >= $statslen &&
        imbalanced($goodness, $baseline_goodness)) {
        $do_reconfig = 1;
    }

    if ($do_reconfig) {
        $ret = do_reconfig($compdelta);

        if ($ret != 0) {
            clear_deltas();
            syslog('debug', "do_reconfig FAILED!") if $ret == -1;
        } else {
            syslog('debug', "setting new baseline of $goodness");
            $baseline_goodness = $goodness;
        }
    }
    syslog('debug', "---------------------------------------");
}