1#!/usr/perl5/bin/perl 2# 3# CDDL HEADER START 4# 5# The contents of this file are subject to the terms of the 6# Common Development and Distribution License (the "License"). 7# You may not use this file except in compliance with the License. 8# 9# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10# or http://www.opensolaris.org/os/licensing. 11# See the License for the specific language governing permissions 12# and limitations under the License. 13# 14# When distributing Covered Code, include this CDDL HEADER in each 15# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16# If applicable, add the following below this CDDL HEADER, with the 17# fields enclosed by brackets "[]" replaced with your own identifying 18# information: Portions Copyright [yyyy] [name of copyright owner] 19# 20# CDDL HEADER END 21# 22 23# 24# Copyright 2007 Sun Microsystems, Inc. All rights reserved. 25# Use is subject to license terms. 26# 27#ident "%Z%%M% %I% %E% SMI" 28# 29 30require 5.6.1; 31use strict; 32use warnings; 33use POSIX; 34use File::Basename("basename"); 35 36my $cmdname = basename($0); 37 38my $using_scengen = 0; # 1 if using scenario simulator 39my $debug = 0; 40 41my $normal_sleeptime = 10; # time to sleep between samples 42my $idle_sleeptime = 45; # time to sleep when idle 43my $onecpu_sleeptime = (60 * 15); # used if only 1 CPU on system 44my $sleeptime = $normal_sleeptime; # either normal_ or idle_ or onecpu_ 45 46my $idle_intrload = .1; # idle if interrupt load < 10% 47 48my $timerange_toohi = .01; 49my $statslen = 60; # time period (in secs) to keep in @deltas 50 51 52# Parse arguments. intrd does not accept any public arguments; the two 53# arguments below are meant for testing purposes. -D generates a significant 54# amount of syslog output. -S <filename> loads the filename as a perl 55# script. That file is expected to implement a kstat "simulator" which 56# can be used to feed information to intrd and verify intrd's responses. 57 58while ($_ = shift @ARGV) { 59 if ($_ eq "-S" && $#ARGV != -1) { 60 $using_scengen = 1; 61 do $ARGV[0]; # load simulator 62 shift @ARGV; 63 } elsif ($_ eq "-D") { 64 $debug = 1; 65 } 66} 67 68if ($using_scengen == 0) { 69 require Sun::Solaris::Kstat; 70 require Sun::Solaris::Intrs; 71 import Sun::Solaris::Intrs(qw(intrmove is_pcplusmp)); 72 require Sys::Syslog; 73 import Sys::Syslog; 74 openlog($cmdname, 'pid', 'daemon'); 75 setlogmask(Sys::Syslog::LOG_UPTO($debug > 0 ? &Sys::Syslog::LOG_DEBUG : 76 &Sys::Syslog::LOG_INFO)); 77} 78 79my $asserted = 0; 80my $assert_level = 'debug'; # syslog level for assertion failures 81sub VERIFY($@) 82{ 83 my $bad = (shift() == 0); # $_[0] == 0 means assert failed 84 if ($bad) { 85 my $msg = shift(); 86 syslog($assert_level, "VERIFY: $msg", @_); 87 $asserted++; 88 } 89 return ($bad); 90} 91 92 93 94 95sub getstat($$); 96sub generate_delta($$); 97sub compress_deltas($); 98sub dumpdelta($); 99 100sub goodness($); 101sub imbalanced($$); 102sub do_reconfig($); 103 104sub goodness_cpu($$); # private function 105sub move_intr($$$$); # private function 106sub ivecs_to_string(@); # private function 107sub do_find_goal($$$$); # private function 108sub find_goal($$); # private function 109sub do_reconfig_cpu2cpu($$$$); # private function 110sub do_reconfig_cpu($$$); # private function 111 112 113# 114# What follow are the basic data structures routines of intrd. 115# 116# getstat() is responsible for reading the kstats and generating a "stat" hash. 117# 118# generate_delta() is responsible for taking two "stat" hashes and creating 119# a new "delta" hash that represents what has changed over time. 120# 121# compress_deltas() is responsible for taking a list of deltas and generating 122# a single delta hash that encompasses all the time periods described by the 123# deltas. 124 125 126# 127# getstat() is handed a reference to a kstat and generates a hash, returned 128# by reference, containing all the fields from the kstats which we need. 129# If it returns the scalar 0, it failed to gather the kstats, and the caller 130# should react accordingly. 131# 132# getstat() is also responsible for maintaining a reasonable $sleeptime. 133# 134# {"snaptime"} kstat's snaptime 135# {<cpuid>} one hash reference per online cpu 136# ->{"tot"} == cpu:<cpuid>:sys:cpu_nsec_{user + kernel + idle} 137# ->{"crtime"} == cpu:<cpuid>:sys:crtime 138# ->{"ivecs"} 139# ->{<cookie#>} iterates over pci_intrs::<nexus>:cookie 140# ->{"time"} == pci_intrs:<ivec#>:<nexus>:time (in nsec) 141# ->{"pil"} == pci_intrs:<ivec#>:<nexus>:pil 142# ->{"crtime"} == pci_intrs:<ivec#>:<nexus>:crtime 143# ->{"ino"} == pci_intrs:<ivec#>:<nexus>:ino 144# ->{"num_ino"} == num inos of single device instance sharing this entry 145# Will be > 1 on pcplusmp X86 systems for devices 146# with multiple MSI interrupts. 147# ->{"buspath"} == pci_intrs:<ivec#>:<nexus>:buspath 148# ->{"name"} == pci_intrs:<ivec#>:<nexus>:name 149# ->{"ihs"} == pci_intrs:<ivec#>:<nexus>:ihs 150# 151 152sub getstat($$) 153{ 154 my ($ks, $pcplusmp_sys) = @_; 155 156 my $cpucnt = 0; 157 my %stat = (); 158 my ($minsnap, $maxsnap); 159 160 # Hash of hash which matches (MSI device, ino) combos to kstats. 161 my %msidevs = (); 162 163 # kstats are not generated atomically. Each kstat hierarchy will 164 # have been generated within the kernel at a different time. On a 165 # thrashing system, we may not run quickly enough in order to get 166 # coherent kstat timing information across all the kstats. To 167 # determine if this is occurring, $minsnap/$maxsnap are used to 168 # find the breadth between the first and last snaptime of all the 169 # kstats we access. $maxsnap - $minsnap roughly represents the 170 # total time taken up in getstat(). If this time approaches the 171 # time between snapshots, our results may not be useful. 172 173 $minsnap = -1; # snaptime is always a positive number 174 $maxsnap = $minsnap; 175 176 # Iterate over the cpus in cpu:<cpuid>::. Check 177 # cpu_info:<cpuid>:cpu_info<cpuid>:state to make sure the 178 # processor is "on-line". If not, it isn't accepting interrupts 179 # and doesn't concern us. 180 # 181 # Record cpu:<cpuid>:sys:snaptime, and check $minsnap/$maxsnap. 182 183 while (my ($cpu, $cpst) = each %{$ks->{cpu}}) { 184 next if !exists($ks->{cpu_info}{$cpu}{"cpu_info$cpu"}{state}); 185 #"state" fld of kstat w/ 186 # modname inst name-"cpuinfo0" 187 my $state = $ks->{cpu_info}{$cpu}{"cpu_info$cpu"}{state}; 188 next if ($state !~ /^on-line\0/); 189 my $cpu_sys = $cpst->{sys}; 190 191 $stat{$cpu}{tot} = ($cpu_sys->{cpu_nsec_idle} + 192 $cpu_sys->{cpu_nsec_user} + 193 $cpu_sys->{cpu_nsec_kernel}); 194 $stat{$cpu}{crtime} = $cpu_sys->{crtime}; 195 $stat{$cpu}{ivecs} = {}; 196 197 if ($minsnap == -1 || $cpu_sys->{snaptime} < $minsnap) { 198 $minsnap = $cpu_sys->{snaptime}; 199 } 200 if ($cpu_sys->{snaptime} > $maxsnap) { 201 $maxsnap = $cpu_sys->{snaptime}; 202 } 203 $cpucnt++; 204 } 205 206 if ($cpucnt <= 1) { 207 $sleeptime = $onecpu_sleeptime; 208 return (0); # nothing to do with 1 CPU 209 } 210 211 # Iterate over the ivecs. If the cpu is not on-line, ignore the 212 # ivecs mapped to it, if any. 213 # 214 # Record pci_intrs:{inum}:<nexus>:time, snaptime, crtime, pil, 215 # ino, name, and buspath. Check $minsnap/$maxsnap. 216 217 foreach my $inst (values(%{$ks->{pci_intrs}})) { 218 my $intrcfg = (values(%$inst))[0]; 219 my $cpu = $intrcfg->{cpu}; 220 221 next unless exists $stat{$cpu}; 222 next if ($intrcfg->{type} =~ /^disabled\0/); 223 224 # Perl looks beyond NULL chars in pattern matching. 225 # Truncate name field at the first NULL 226 $intrcfg->{name} =~ s/\0.*$//; 227 228 if ($intrcfg->{snaptime} < $minsnap) { 229 $minsnap = $intrcfg->{snaptime}; 230 } elsif ($intrcfg->{snaptime} > $maxsnap) { 231 $maxsnap = $intrcfg->{snaptime}; 232 } 233 234 my $cookie = "$intrcfg->{buspath} $intrcfg->{ino}"; 235 if (exists $stat{$cpu}{ivecs}{$cookie}) { 236 my $cookiestats = $stat{$cpu}{ivecs}{$cookie}; 237 238 $cookiestats->{time} += $intrcfg->{time}; 239 $cookiestats->{name} .= "/$intrcfg->{name}"; 240 241 # If this new interrupt sharing $cookie represents a 242 # change from an earlier getstat, make sure that 243 # generate_delta will see the change by setting 244 # crtime to the most recent crtime of its components. 245 246 if ($intrcfg->{crtime} > $cookiestats->{crtime}) { 247 $cookiestats->{crtime} = $intrcfg->{crtime}; 248 } 249 $cookiestats->{ihs}++; 250 next; 251 } 252 $stat{$cpu}{ivecs}{$cookie}{time} = $intrcfg->{time}; 253 $stat{$cpu}{ivecs}{$cookie}{crtime} = $intrcfg->{crtime}; 254 $stat{$cpu}{ivecs}{$cookie}{pil} = $intrcfg->{pil}; 255 $stat{$cpu}{ivecs}{$cookie}{ino} = $intrcfg->{ino}; 256 $stat{$cpu}{ivecs}{$cookie}{num_ino} = 1; 257 $stat{$cpu}{ivecs}{$cookie}{buspath} = $intrcfg->{buspath}; 258 $stat{$cpu}{ivecs}{$cookie}{name} = $intrcfg->{name}; 259 $stat{$cpu}{ivecs}{$cookie}{ihs} = 1; 260 261 if ($pcplusmp_sys && ($intrcfg->{type} =~ /^msi\0/)) { 262 if (!(exists($msidevs{$intrcfg->{name}}))) { 263 $msidevs{$intrcfg->{name}} = {}; 264 } 265 $msidevs{$intrcfg->{name}}{$intrcfg->{ino}} = 266 \$stat{$cpu}{ivecs}{$cookie}; 267 } 268 } 269 270 # All MSI interrupts of a device instance share a single MSI address. 271 # On X86 systems with an APIC, this MSI address is interpreted as CPU 272 # routing info by the APIC. For this reason, on these platforms, all 273 # interrupts for MSI devices must be moved to the same CPU at the same 274 # time. 275 # 276 # Since all interrupts will be on the same CPU on these platforms, all 277 # interrupts can be consolidated into one ivec entry. For such devices, 278 # num_ino will be > 1 to denote that a group move is needed. 279 280 # Loop thru all MSI devices on X86 pcplusmp systems. 281 # Nop on other systems. 282 foreach my $msidevkey (sort keys %msidevs) { 283 284 # Loop thru inos of the device, sorted by lowest value first 285 # For each cookie found for a device, incr num_ino for the 286 # lowest cookie and remove other cookies. 287 288 # Assumes PIL is the same for first and current cookies 289 290 my $first_ino = -1; 291 my $first_cookiep; 292 my $curr_cookiep; 293 foreach my $inokey (sort keys %{$msidevs{$msidevkey}}) { 294 $curr_cookiep = $msidevs{$msidevkey}{$inokey}; 295 if ($first_ino == -1) { 296 $first_ino = $inokey; 297 $first_cookiep = $curr_cookiep; 298 } else { 299 $$first_cookiep->{num_ino}++; 300 $$first_cookiep->{time} += 301 $$curr_cookiep->{time}; 302 if ($$curr_cookiep->{crtime} > 303 $$first_cookiep->{crtime}) { 304 $$first_cookiep->{crtime} = 305 $$curr_cookiep->{crtime}; 306 } 307 # Invalidate this cookie, less complicated and 308 # more efficient than deleting it. 309 $$curr_cookiep->{num_ino} = 0; 310 } 311 } 312 } 313 314 # We define the timerange as the amount of time spent gathering the 315 # various kstats, divided by our sleeptime. If we take a lot of time 316 # to access the kstats, and then we create a delta comparing these 317 # kstats with a prior set of kstats, that delta will cover 318 # substaintially different amount of time depending upon which 319 # interrupt or CPU is being examined. 320 # 321 # By checking the timerange here, we guarantee that any deltas 322 # created from these kstats will contain self-consistent data, 323 # in that all CPUs and interrupts cover a similar span of time. 324 # 325 # $timerange_toohi is the upper bound. Any timerange above 326 # this is thrown out as garbage. If the stat is safely within this 327 # bound, we treat the stat as representing an instant in time, rather 328 # than the time range it actually spans. We arbitrarily choose minsnap 329 # as the snaptime of the stat. 330 331 $stat{snaptime} = $minsnap; 332 my $timerange = ($maxsnap - $minsnap) / $sleeptime; 333 return (0) if ($timerange > $timerange_toohi); # i.e. failure 334 return (\%stat); 335} 336 337# 338# dumpdelta takes a reference to our "delta" structure: 339# {"missing"} "1" if the delta's component stats had inconsistencies 340# {"minsnap"} time of the first kstat snaptime used in this delta 341# {"maxsnap"} time of the last kstat snaptime used in this delta 342# {"goodness"} cost function applied to this delta 343# {"avgintrload"} avg of interrupt load across cpus, as a percentage 344# {"avgintrnsec"} avg number of nsec spent in interrupts, per cpu 345# {<cpuid>} iterates over on-line cpus 346# ->{"intrs"} cpu's movable intr time (sum of "time" for each ivec) 347# ->{"tot"} CPU load from all sources in nsec 348# ->{"bigintr"} largest value of {ivecs}{<ivec#>}{time} from below 349# ->{"intrload"} intrs / tot 350# ->{"ivecs"} 351# ->{<ivec#>} iterates over ivecs for this cpu 352# ->{"time"} time used by this interrupt (in nsec) 353# ->{"pil"} pil level of this interrupt 354# ->{"ino"} interrupt number (or base vector if MSI group) 355# ->{"buspath"} filename of the directory of the device's bus 356# ->{"name"} device name 357# ->{"ihs"} number of different handlers sharing this ino 358# ->{"num_ino"} number of interrupt vectors in MSI group 359# 360# It prints out the delta structure in a nice, human readable display. 361# 362 363sub dumpdelta($) 364{ 365 my ($delta) = @_; 366 367 # print global info 368 369 syslog('debug', "dumpdelta:"); 370 syslog('debug', " RECONFIGURATION IN DELTA") if $delta->{missing} > 0; 371 syslog('debug', " avgintrload: %5.2f%% avgintrnsec: %d", 372 $delta->{avgintrload} * 100, $delta->{avgintrnsec}); 373 syslog('debug', " goodness: %5.2f%%", $delta->{goodness} * 100) 374 if exists($delta->{goodness}); 375 376 # iterate over cpus 377 378 while (my ($cpu, $cpst) = each %$delta) { 379 next if !ref($cpst); # skip non-cpuid entries 380 my $tot = $cpst->{tot}; 381 syslog('debug', " cpu %3d intr %7.3f%% (bigintr %7.3f%%)", 382 $cpu, $cpst->{intrload}*100, $cpst->{bigintr}*100/$tot); 383 syslog('debug', " intrs %d, bigintr %d", 384 $cpst->{intrs}, $cpst->{bigintr}); 385 386 # iterate over ivecs on this cpu 387 388 while (my ($ivec, $ivst) = each %{$cpst->{ivecs}}) { 389 syslog('debug', " %15s:\"%s\": %7.3f%% %d", 390 ($ivst->{ihs} > 1 ? "$ivst->{name}($ivst->{ihs})" : 391 $ivst->{name}), $ivec, 392 $ivst->{time}*100 / $tot, $ivst->{time}); 393 } 394 } 395} 396 397# 398# generate_delta($stat, $newstat) takes two stat references, returned from 399# getstat(), and creates a %delta. %delta (not surprisingly) contains the 400# same basic info as stat and newstat, but with the timestamps as deltas 401# instead of absolute times. We return a reference to the delta. 402# 403 404sub generate_delta($$) 405{ 406 my ($stat, $newstat) = @_; 407 408 my %delta = (); 409 my $intrload; 410 my $intrnsec; 411 my $cpus; 412 413 # Take the worstcase timerange 414 $delta{minsnap} = $stat->{snaptime}; 415 $delta{maxsnap} = $newstat->{snaptime}; 416 if (VERIFY($delta{maxsnap} > $delta{minsnap}, 417 "generate_delta: stats aren't ascending")) { 418 $delta{missing} = 1; 419 return (\%delta); 420 } 421 422 # if there are a different number of cpus in the stats, set missing 423 424 $delta{missing} = (keys(%$stat) != keys(%$newstat)); 425 if (VERIFY($delta{missing} == 0, 426 "generate_delta: number of CPUs changed")) { 427 return (\%delta); 428 } 429 430 # scan through every cpu in %newstat and compare against %stat 431 432 while (my ($cpu, $newcpst) = each %$newstat) { 433 next if !ref($newcpst); # skip non-cpuid fields 434 435 # If %stat is missing a cpu from %newstat, then it was just 436 # onlined. Mark missing. 437 438 if (VERIFY(exists $stat->{$cpu} && 439 $stat->{$cpu}{crtime} == $newcpst->{crtime}, 440 "generate_delta: cpu $cpu changed")) { 441 $delta{missing} = 1; 442 return (\%delta); 443 } 444 my $cpst = $stat->{$cpu}; 445 $delta{$cpu}{tot} = $newcpst->{tot} - $cpst->{tot}; 446 if (VERIFY($delta{$cpu}{tot} >= 0, 447 "generate_delta: deltas are not ascending?")) { 448 $delta{missing} = 1; 449 delete($delta{$cpu}); 450 return (\%delta); 451 } 452 # Avoid remote chance of division by zero 453 $delta{$cpu}{tot} = 1 if $delta{$cpu}{tot} == 0; 454 $delta{$cpu}{intrs} = 0; 455 $delta{$cpu}{bigintr} = 0; 456 457 my %ivecs = (); 458 $delta{$cpu}{ivecs} = \%ivecs; 459 460 # if the number of ivecs differs, set missing 461 462 if (VERIFY(keys(%{$cpst->{ivecs}}) == 463 keys(%{$newcpst->{ivecs}}), 464 "generate_delta: cpu $cpu has more/less". 465 " interrupts")) { 466 $delta{missing} = 1; 467 return (\%delta); 468 } 469 470 while (my ($inum, $newivec) = each %{$newcpst->{ivecs}}) { 471 472 # Unused cookie, corresponding to an MSI vector which 473 # is part of a group. The whole group is accounted for 474 # by a different cookie. 475 next if ($newivec->{num_ino} == 0); 476 477 # If this ivec doesn't exist in $stat, or if $stat 478 # shows a different crtime, set missing. 479 if (VERIFY(exists $cpst->{ivecs}{$inum} && 480 $cpst->{ivecs}{$inum}{crtime} == 481 $newivec->{crtime}, 482 "generate_delta: cpu $cpu inum $inum". 483 " has changed")) { 484 $delta{missing} = 1; 485 return (\%delta); 486 } 487 my $ivec = $cpst->{ivecs}{$inum}; 488 489 # Create $delta{$cpu}{ivecs}{$inum}. 490 491 my %dltivec = (); 492 $delta{$cpu}{ivecs}{$inum} = \%dltivec; 493 494 # calculate time used by this interrupt 495 496 my $time = $newivec->{time} - $ivec->{time}; 497 if (VERIFY($time >= 0, 498 "generate_delta: ivec went backwards?")) { 499 $delta{missing} = 1; 500 delete($delta{$cpu}{ivecs}{$inum}); 501 return (\%delta); 502 } 503 $delta{$cpu}{intrs} += $time; 504 $dltivec{time} = $time; 505 if ($time > $delta{$cpu}{bigintr}) { 506 $delta{$cpu}{bigintr} = $time; 507 } 508 509 # Transfer over basic info about the kstat. We 510 # don't have to worry about discrepancies between 511 # ivec and newivec because we verified that both 512 # have the same crtime. 513 514 $dltivec{pil} = $newivec->{pil}; 515 $dltivec{ino} = $newivec->{ino}; 516 $dltivec{buspath} = $newivec->{buspath}; 517 $dltivec{name} = $newivec->{name}; 518 $dltivec{ihs} = $newivec->{ihs}; 519 $dltivec{num_ino} = $newivec->{num_ino}; 520 } 521 if ($delta{$cpu}{tot} < $delta{$cpu}{intrs}) { 522 # Ewww! Hopefully just a rounding error. 523 # Make something up. 524 $delta{$cpu}{tot} = $delta{$cpu}{intrs}; 525 } 526 $delta{$cpu}{intrload} = 527 $delta{$cpu}{intrs} / $delta{$cpu}{tot}; 528 $intrload += $delta{$cpu}{intrload}; 529 $intrnsec += $delta{$cpu}{intrs}; 530 $cpus++; 531 } 532 if ($cpus > 0) { 533 $delta{avgintrload} = $intrload / $cpus; 534 $delta{avgintrnsec} = $intrnsec / $cpus; 535 } else { 536 $delta{avgintrload} = 0; 537 $delta{avgintrnsec} = 0; 538 } 539 return (\%delta); 540} 541 542 543# compress_delta takes a list of deltas, and returns a single new delta 544# which represents the combined information from all the deltas. The deltas 545# provided are assumed to be sequential in time. The resulting compressed 546# delta looks just like any other delta. This new delta is also more accurate 547# since its statistics are averaged over a longer period than any of the 548# original deltas. 549 550sub compress_deltas ($) 551{ 552 my ($deltas) = @_; 553 554 my %newdelta = (); 555 my ($intrs, $tot); 556 my $cpus = 0; 557 my ($high_intrload) = 0; 558 559 if (VERIFY($#$deltas != -1, 560 "compress_deltas: list of delta is empty?")) { 561 return (0); 562 } 563 $newdelta{minsnap} = $deltas->[0]{minsnap}; 564 $newdelta{maxsnap} = $deltas->[$#$deltas]{maxsnap}; 565 $newdelta{missing} = 0; 566 567 foreach my $delta (@$deltas) { 568 if (VERIFY($delta->{missing} == 0, 569 "compressing bad deltas?")) { 570 return (0); 571 } 572 while (my ($cpuid, $cpu) = each %$delta) { 573 next if !ref($cpu); 574 575 $intrs += $cpu->{intrs}; 576 $tot += $cpu->{tot}; 577 $newdelta{$cpuid}{intrs} += $cpu->{intrs}; 578 $newdelta{$cpuid}{tot} += $cpu->{tot}; 579 if (!exists $newdelta{$cpuid}{ivecs}) { 580 my %ivecs = (); 581 $newdelta{$cpuid}{ivecs} = \%ivecs; 582 } 583 while (my ($inum, $ivec) = each %{$cpu->{ivecs}}) { 584 my $newivecs = $newdelta{$cpuid}{ivecs}; 585 $newivecs->{$inum}{time} += $ivec->{time}; 586 $newivecs->{$inum}{pil} = $ivec->{pil}; 587 $newivecs->{$inum}{ino} = $ivec->{ino}; 588 $newivecs->{$inum}{buspath} = $ivec->{buspath}; 589 $newivecs->{$inum}{name} = $ivec->{name}; 590 $newivecs->{$inum}{ihs} = $ivec->{ihs}; 591 $newivecs->{$inum}{num_ino} = $ivec->{num_ino}; 592 } 593 } 594 } 595 foreach my $cpu (values(%newdelta)) { 596 next if !ref($cpu); # ignore non-cpu fields 597 $cpus++; 598 599 my $bigintr = 0; 600 foreach my $ivec (values(%{$cpu->{ivecs}})) { 601 if ($ivec->{time} > $bigintr) { 602 $bigintr = $ivec->{time}; 603 } 604 } 605 $cpu->{bigintr} = $bigintr; 606 $cpu->{intrload} = $cpu->{intrs} / $cpu->{tot}; 607 if ($high_intrload < $cpu->{intrload}) { 608 $high_intrload = $cpu->{intrload}; 609 } 610 $cpu->{tot} = 1 if $cpu->{tot} <= 0; 611 } 612 if ($cpus == 0) { 613 $newdelta{avgintrnsec} = 0; 614 $newdelta{avgintrload} = 0; 615 } else { 616 $newdelta{avgintrnsec} = $intrs / $cpus; 617 $newdelta{avgintrload} = $intrs / $tot; 618 } 619 $sleeptime = ($high_intrload < $idle_intrload) ? $idle_sleeptime : 620 $normal_sleeptime; 621 return (\%newdelta); 622} 623 624 625 626 627 628# What follow are the core functions responsible for examining the deltas 629# generated above and deciding what to do about them. 630# 631# goodness() and its helper goodness_cpu() return a heuristic which describe 632# how good (or bad) the current interrupt balance is. The value returned will 633# be between 0 and 1, with 0 representing maximum goodness, and 1 representing 634# maximum badness. 635# 636# imbalanced() compares a current and historical value of goodness, and 637# determines if there has been enough change to warrant evaluating a 638# reconfiguration of the interrupts 639# 640# do_reconfig(), and its helpers, do_reconfig_cpu(), do_reconfig_cpu2cpu(), 641# find_goal(), do_find_goal(), and move_intr(), are responsible for examining 642# a delta and determining the best possible assignment of interrupts to CPUs. 643# 644# It is important that do_reconfig() be in alignment with goodness(). If 645# do_reconfig were to generate a new interrupt distribution that worsened 646# goodness, we could get into a pathological loop with intrd fighting itself, 647# constantly deciding that things are imbalanced, and then changing things 648# only to make them worse. 649 650 651 652# any goodness over $goodness_unsafe_load is considered really bad 653# goodness must drop by at least $goodness_mindelta for a reconfig 654 655my $goodness_unsafe_load = .9; 656my $goodness_mindelta = .1; 657 658# goodness(%delta) examines a delta and return its "goodness". goodness will 659# be between 0 (best) and 1 (major bad). goodness is determined by evaluating 660# the goodness of each individual cpu, and returning the worst case. This 661# helps on systems with many CPUs, where otherwise a single pathological CPU 662# might otherwise be ignored because the average was OK. 663# 664# To calculate the goodness of an individual CPU, we start by looking at its 665# load due to interrupts. If the load is above a certain high threshold and 666# there is more than one interrupt assigned to this CPU, we set goodness 667# to worst-case. If the load is below the average interrupt load of all CPUs, 668# then we return best-case, since what's to complain about? 669# 670# Otherwise we look at how much the load is above the average, and return 671# that as the goodness, with one caveat: we never return more than the CPU's 672# interrupt load ignoring its largest single interrupt source. This is 673# because a CPU with one high-load interrupt, and no other interrupts, is 674# perfectly balanced. Nothing can be done to improve the situation, and thus 675# it is perfectly balanced even if the interrupt's load is 100%. 676 677sub goodness($) 678{ 679 my ($delta) = @_; 680 681 return (1) if $delta->{missing} > 0; 682 683 my $high_goodness = 0; 684 my $goodness; 685 686 foreach my $cpu (values(%$delta)) { 687 next if !ref($cpu); # skip non-cpuid fields 688 689 $goodness = goodness_cpu($cpu, $delta->{avgintrload}); 690 if (VERIFY($goodness >= 0 && $goodness <= 1, 691 "goodness: cpu goodness out of range?")) { 692 dumpdelta($delta); 693 return (1); 694 } 695 if ($goodness == 1) { 696 return (1); # worst case, no need to continue 697 } 698 if ($goodness > $high_goodness) { 699 $high_goodness = $goodness; 700 } 701 } 702 return ($high_goodness); 703} 704 705sub goodness_cpu($$) # private function 706{ 707 my ($cpu, $avgintrload) = @_; 708 709 my $goodness; 710 my $load = $cpu->{intrs} / $cpu->{tot}; 711 712 return (0) if ($load < $avgintrload); # low loads are perfectly good 713 714 # Calculate $load_no_bigintr, which represents the load 715 # due to interrupts, excluding the one biggest interrupt. 716 # This is the most gain we can get on this CPU from 717 # offloading interrupts. 718 719 my $load_no_bigintr = ($cpu->{intrs} - $cpu->{bigintr}) / $cpu->{tot}; 720 721 # A major imbalance is indicated if a CPU is saturated 722 # with interrupt handling, and it has more than one 723 # source of interrupts. Those other interrupts could be 724 # starved if of a lower pil. Return a goodness of 1, 725 # which is the worst possible return value, 726 # which will effectively contaminate this entire delta. 727 728 my $cnt = keys(%{$cpu->{ivecs}}); 729 730 if ($load > $goodness_unsafe_load && $cnt > 1) { 731 return (1); 732 } 733 $goodness = $load - $avgintrload; 734 if ($goodness > $load_no_bigintr) { 735 $goodness = $load_no_bigintr; 736 } 737 return ($goodness); 738} 739 740 741# imbalanced() is used by the main routine to determine if the goodness 742# has shifted far enough from our last baseline to warrant a reassignment 743# of interrupts. A very high goodness indicates that a CPU is way out of 744# whack. If the goodness has varied too much since the baseline, then 745# perhaps a reconfiguration is worth considering. 746 747sub imbalanced ($$) 748{ 749 my ($goodness, $baseline) = @_; 750 751 # Return 1 if we are pathological, or creeping away from the baseline 752 753 return (1) if $goodness > .50; 754 return (1) if abs($goodness - $baseline) > $goodness_mindelta; 755 return (0); 756} 757 758# do_reconfig(), do_reconfig_cpu(), and do_reconfig_cpu2cpu(), are the 759# decision-making functions responsible for generating a new interrupt 760# distribution. They are designed with the definition of goodness() in 761# mind, i.e. they use the same definition of "good distribution" as does 762# goodness(). 763# 764# do_reconfig() is responsible for deciding whether a redistribution is 765# actually warranted. If the goodness is already pretty good, it doesn't 766# waste the CPU time to generate a new distribution. If it 767# calculates a new distribution and finds that it is not sufficiently 768# improved from the prior distirbution, it will not do the redistribution, 769# mainly to avoid the disruption to system performance caused by 770# rejuggling interrupts. 771# 772# Its main loop works by going through a list of cpus sorted from 773# highest to lowest interrupt load. It removes the highest-load cpus 774# one at a time and hands them off to do_reconfig_cpu(). This function 775# then re-sorts the remaining CPUs from lowest to highest interrupt load, 776# and one at a time attempts to rejuggle interrupts between the original 777# high-load CPU and the low-load CPU. Rejuggling on a high-load CPU is 778# considered finished as soon as its interrupt load is within 779# $goodness_mindelta of the average interrupt load. Such a CPU will have 780# a goodness of below the $goodness_mindelta threshold. 781 782# 783# move_intr(\%delta, $inum, $oldcpu, $newcpu) 784# used by reconfiguration code to move an interrupt between cpus within 785# a delta. This manipulates data structures, and does not actually move 786# the interrupt on the running system. 787# 788sub move_intr($$$$) # private function 789{ 790 my ($delta, $inum, $oldcpuid, $newcpuid) = @_; 791 792 my $ivec = $delta->{$oldcpuid}{ivecs}{$inum}; 793 794 # Remove ivec from old cpu 795 796 my $oldcpu = $delta->{$oldcpuid}; 797 $oldcpu->{intrs} -= $ivec->{time}; 798 $oldcpu->{intrload} = $oldcpu->{intrs} / $oldcpu->{tot}; 799 delete($oldcpu->{ivecs}{$inum}); 800 801 VERIFY($oldcpu->{intrs} >= 0, "move_intr: intr's time > total time?"); 802 VERIFY($ivec->{time} <= $oldcpu->{bigintr}, 803 "move_intr: intr's time > bigintr?"); 804 805 if ($ivec->{time} >= $oldcpu->{bigintr}) { 806 my $bigtime = 0; 807 808 foreach my $ivec (values(%{$oldcpu->{ivecs}})) { 809 $bigtime = $ivec->{time} if $ivec->{time} > $bigtime; 810 } 811 $oldcpu->{bigintr} = $bigtime; 812 } 813 814 # Add ivec onto new cpu 815 816 my $newcpu = $delta->{$newcpuid}; 817 818 $ivec->{nowcpu} = $newcpuid; 819 $newcpu->{intrs} += $ivec->{time}; 820 $newcpu->{intrload} = $newcpu->{intrs} / $newcpu->{tot}; 821 $newcpu->{ivecs}{$inum} = $ivec; 822 823 $newcpu->{bigintr} = $ivec->{time} 824 if $ivec->{time} > $newcpu->{bigintr}; 825} 826 827sub move_intr_check($$$) # private function 828{ 829 my ($delta, $oldcpuid, $newcpuid) = @_; 830 831 VERIFY($delta->{$oldcpuid}{tot} >= $delta->{$oldcpuid}{intrs}, 832 "Moved interrupts left 100+%% load on src cpu"); 833 VERIFY($delta->{$newcpuid}{tot} >= $delta->{$newcpuid}{intrs}, 834 "Moved interrupts left 100+%% load on tgt cpu"); 835} 836 837sub ivecs_to_string(@) # private function 838{ 839 my $str = ""; 840 foreach my $ivec (@_) { 841 $str = "$str $ivec->{inum}"; 842 } 843 return ($str); 844} 845 846 847sub do_reconfig($) 848{ 849 my ($delta) = @_; 850 851 my $goodness = $delta->{goodness}; 852 853 # We can't improve goodness to better than 0. We should stop here 854 # if, even if we achieve a goodness of 0, the improvement is still 855 # too small to merit the action. 856 857 if ($goodness - 0 < $goodness_mindelta) { 858 syslog('debug', "goodness good enough, don't reconfig"); 859 return (0); 860 } 861 862 syslog('notice', "Optimizing interrupt assignments"); 863 864 if (VERIFY ($delta->{missing} == 0, "RECONFIG Aborted: should not ". 865 "have a delta with missing")) { 866 return (-1); 867 } 868 869 # Make a list of all cpuids, and also add some extra information 870 # to the ivec structures. 871 872 my @cpusortlist = (); 873 874 while (my ($cpuid, $cpu) = each %$delta) { 875 next if !ref($cpu); # skip non-cpu entries 876 877 push(@cpusortlist, $cpuid); 878 while (my ($inum, $ivec) = each %{$cpu->{ivecs}}) { 879 $ivec->{origcpu} = $cpuid; 880 $ivec->{nowcpu} = $cpuid; 881 $ivec->{inum} = $inum; 882 } 883 } 884 885 # Sort the list of CPUs from highest to lowest interrupt load. 886 # Remove the top CPU from that list and attempt to redistribute 887 # its interrupts. If the CPU has a goodness below a threshold, 888 # just ignore the CPU and move to the next one. If the CPU's 889 # load falls below the average load plus that same threshold, 890 # then there are no CPUs left worth reconfiguring, and we're done. 891 892 while (@cpusortlist) { 893 # Re-sort cpusortlist each time, since do_reconfig_cpu can 894 # move interrupts around. 895 896 @cpusortlist = 897 sort({$delta->{$b}{intrload} <=> $delta->{$a}{intrload}} 898 @cpusortlist); 899 900 my $cpu = shift(@cpusortlist); 901 if (($delta->{$cpu}{intrload} <= $goodness_unsafe_load) && 902 ($delta->{$cpu}{intrload} <= 903 $delta->{avgintrload} + $goodness_mindelta)) { 904 syslog('debug', "finished reconfig: cpu $cpu load ". 905 "$delta->{$cpu}{intrload} avgload ". 906 "$delta->{avgintrload}"); 907 last; 908 } 909 if (goodness_cpu($delta->{$cpu}, $delta->{avgintrload}) < 910 $goodness_mindelta) { 911 next; 912 } 913 do_reconfig_cpu($delta, \@cpusortlist, $cpu); 914 } 915 916 # How good a job did we do? If the improvement was minimal, and 917 # our goodness wasn't pathological (and thus needing any help it 918 # can get), then don't bother moving the interrupts. 919 920 my $newgoodness = goodness($delta); 921 VERIFY($newgoodness <= $goodness, 922 "reconfig: result has worse goodness?"); 923 924 if (($goodness != 1 || $newgoodness == 1) && 925 $goodness - $newgoodness < $goodness_mindelta) { 926 syslog('debug', "goodness already near optimum, ". 927 "don't reconfig"); 928 return (0); 929 } 930 syslog('debug', "goodness %5.2f%% --> %5.2f%%", $goodness*100, 931 $newgoodness*100); 932 933 # Time to move those interrupts! 934 935 my $ret = 1; 936 my $warned = 0; 937 while (my ($cpuid, $cpu) = each %$delta) { 938 next if $cpuid =~ /\D/; 939 while (my ($inum, $ivec) = each %{$cpu->{ivecs}}) { 940 next if ($ivec->{origcpu} == $cpuid); 941 942 if (!intrmove($ivec->{buspath}, $ivec->{ino}, 943 $cpuid, $ivec->{num_ino})) { 944 syslog('warning', "Unable to move interrupts") 945 if $warned++ == 0; 946 syslog('debug', "Unable to move buspath ". 947 "$ivec->{buspath} ino $ivec->{ino} to ". 948 "cpu $cpuid"); 949 $ret = -1; 950 } 951 } 952 } 953 954 syslog('notice', "Interrupt assignments optimized"); 955 return ($ret); 956} 957 958sub do_reconfig_cpu($$$) # private function 959{ 960 my ($delta, $cpusortlist, $oldcpuid) = @_; 961 962 # We have been asked to rejuggle interrupts between $oldcpuid and 963 # other CPUs found on $cpusortlist so as to improve the load on 964 # $oldcpuid. We reverse $cpusortlist to get our own copy of the 965 # list, sorted from lowest to highest interrupt load. One at a 966 # time, shift a CPU off of this list of CPUs, and attempt to 967 # rejuggle interrupts between the two CPUs. Don't do this if the 968 # other CPU has a higher load than oldcpuid. We're done rejuggling 969 # once $oldcpuid's goodness falls below a threshold. 970 971 syslog('debug', "reconfiguring $oldcpuid"); 972 973 my $cpu = $delta->{$oldcpuid}; 974 my $avgintrload = $delta->{avgintrload}; 975 976 my @cputargetlist = reverse(@$cpusortlist); # make a copy of the list 977 while ($#cputargetlist != -1) { 978 last if goodness_cpu($cpu, $avgintrload) < $goodness_mindelta; 979 980 my $tgtcpuid = shift(@cputargetlist); 981 my $tgt = $delta->{$tgtcpuid}; 982 my $load = $cpu->{intrload}; 983 my $tgtload = $tgt->{intrload}; 984 last if $tgtload > $load; 985 do_reconfig_cpu2cpu($delta, $oldcpuid, $tgtcpuid, $load); 986 } 987} 988 989sub do_reconfig_cpu2cpu($$$$) # private function 990{ 991 my ($delta, $srccpuid, $tgtcpuid, $srcload) = @_; 992 993 # We've been asked to consider interrupt juggling between srccpuid 994 # (with a high interrupt load) and tgtcpuid (with a lower interrupt 995 # load). First, make a single list with all of the ivecs from both 996 # CPUs, and sort the list from highest to lowest load. 997 998 syslog('debug', "exchanging intrs between $srccpuid and $tgtcpuid"); 999 1000 # Gather together all the ivecs and sort by load 1001 1002 my @ivecs = (values(%{$delta->{$srccpuid}{ivecs}}), 1003 values(%{$delta->{$tgtcpuid}{ivecs}})); 1004 return if $#ivecs == -1; 1005 1006 @ivecs = sort({$b->{time} <=> $a->{time}} @ivecs); 1007 1008 # Our "goal" load for srccpuid is the average load across all CPUs. 1009 # find_goal() will find determine the optimum selection of the 1010 # available interrupts which comes closest to this goal without 1011 # falling below the goal. 1012 1013 my $goal = $delta->{avgintrnsec}; 1014 1015 # We know that the interrupt load on tgtcpuid is less than that on 1016 # srccpuid, but its load could still be above avgintrnsec. Don't 1017 # choose a goal which would bring srccpuid below the load on tgtcpuid. 1018 1019 my $avgnsec = 1020 ($delta->{$srccpuid}{intrs} + $delta->{$tgtcpuid}{intrs}) / 2; 1021 if ($goal < $avgnsec) { 1022 $goal = $avgnsec; 1023 } 1024 1025 # If the largest of the interrupts is on srccpuid, leave it there. 1026 # This can help minimize the disruption caused by moving interrupts. 1027 1028 if ($ivecs[0]->{origcpu} == $srccpuid) { 1029 syslog('debug', "Keeping $ivecs[0]->{inum} on $srccpuid"); 1030 $goal -= $ivecs[0]->{time}; 1031 shift(@ivecs); 1032 } 1033 1034 syslog('debug', "GOAL: inums should total $goal"); 1035 find_goal(\@ivecs, $goal); 1036 1037 # find_goal() returned its results to us by setting $ivec->{goal} if 1038 # the ivec should be on srccpuid, or clearing it for tgtcpuid. 1039 # Call move_intr() to update our $delta with the new results. 1040 1041 foreach my $ivec (@ivecs) { 1042 syslog('debug', "ivec $ivec->{inum} goal $ivec->{goal}"); 1043 VERIFY($ivec->{nowcpu} == $srccpuid || 1044 $ivec->{nowcpu} == $tgtcpuid, "cpu2cpu found an ". 1045 "interrupt not currently on src or tgt cpu"); 1046 1047 if ($ivec->{goal} && $ivec->{nowcpu} != $srccpuid) { 1048 move_intr($delta, $ivec->{inum}, $ivec->{nowcpu}, 1049 $srccpuid); 1050 } elsif ($ivec->{goal} == 0 && $ivec->{nowcpu} != $tgtcpuid) { 1051 move_intr($delta, $ivec->{inum}, $ivec->{nowcpu}, 1052 $tgtcpuid); 1053 } 1054 } 1055 move_intr_check($delta, $srccpuid, $tgtcpuid); # asserts 1056 1057 my $newload = $delta->{$srccpuid}{intrs} / $delta->{$srccpuid}{tot}; 1058 VERIFY($newload <= $srcload && $newload > $delta->{avgintrload}, 1059 "cpu2cpu: new load didn't end up in expected range"); 1060} 1061 1062 1063# find_goal() and its helper do_find_goal() are used to find the best 1064# combination of interrupts in order to generate a load that is as close 1065# as possible to a goal load without falling below that goal. Before returning 1066# to its caller, find_goal() sets a new value in the hash of each interrupt, 1067# {goal}, which if set signifies that this interrupt is one of the interrupts 1068# identified as part of the set of interrupts which best meet the goal. 1069# 1070# The arguments to find_goal are a list of ivecs (hash references), sorted 1071# by descending {time}, and the goal load. The goal is relative to {time}. 1072# The best fit is determined by performing a depth-first search. do_find_goal 1073# is the recursive subroutine which carries out the search. 1074# 1075# It is passed an index as an argument, originally 0. On a given invocation, 1076# it is only to consider interrupts in the ivecs array starting at that index. 1077# It then considers two possibilities: 1078# 1) What is the best goal-fit if I include ivecs[index]? 1079# 2) What is the best goal-fit if I exclude ivecs[index]? 1080# To determine case 1, it subtracts the load of ivecs[index] from the goal, 1081# and calls itself recursively with that new goal and index++. 1082# To determine case 2, it calls itself recursively with the same goal and 1083# index++. 1084# 1085# It then compares the two results, decide which one best meets the goals, 1086# and returns the result. The return value is the best-fit's interrupt load, 1087# followed by a list of all the interrupts which make up that best-fit. 1088# 1089# As an optimization, a second array loads[] is created which mirrors ivecs[]. 1090# loads[i] will equal the total loads of all ivecs[i..$#ivecs]. This is used 1091# by do_find_goal to avoid recursing all the way to the end of the ivecs 1092# array if including all remaining interrupts will still leave the best-fit 1093# at below goal load. If so, it then includes all remaining interrupts on 1094# the goal list and returns. 1095# 1096sub find_goal($$) # private function 1097{ 1098 my ($ivecs, $goal) = @_; 1099 1100 my @goals; 1101 my $load; 1102 my $ivec; 1103 1104 if ($goal <= 0) { 1105 @goals = (); # the empty set will best meet the goal 1106 } else { 1107 syslog('debug', "finding goal from intrs %s", 1108 ivecs_to_string(@$ivecs)); 1109 1110 # Generate @loads array 1111 1112 my $tot = 0; 1113 foreach $ivec (@$ivecs) { 1114 $tot += $ivec->{time}; 1115 } 1116 my @loads = (); 1117 foreach $ivec (@$ivecs) { 1118 push(@loads, $tot); 1119 $tot -= $ivec->{time}; 1120 } 1121 ($load, @goals) = do_find_goal($ivecs, \@loads, $goal, 0); 1122 VERIFY($load >= $goal, "find_goal didn't meet goals"); 1123 } 1124 syslog('debug', "goals found: %s", ivecs_to_string(@goals)); 1125 1126 # Set or clear $ivec->{goal} for each ivec, based on returned @goals 1127 1128 foreach $ivec (@$ivecs) { 1129 if ($#goals > -1 && $ivec == $goals[0]) { 1130 syslog('debug', "inum $ivec->{inum} on source cpu"); 1131 $ivec->{goal} = 1; 1132 shift(@goals); 1133 } else { 1134 syslog('debug', "inum $ivec->{inum} on target cpu"); 1135 $ivec->{goal} = 0; 1136 } 1137 } 1138} 1139 1140 1141sub do_find_goal($$$$) # private function 1142{ 1143 my ($ivecs, $loads, $goal, $idx) = @_; 1144 1145 if ($idx > $#{$ivecs}) { 1146 return (0); 1147 } 1148 syslog('debug', "$idx: finding goal $goal inum $ivecs->[$idx]{inum}"); 1149 1150 my $load = $ivecs->[$idx]{time}; 1151 my @goals_with = (); 1152 my @goals_without = (); 1153 my ($with, $without); 1154 1155 # If we include all remaining items and we're still below goal, 1156 # stop here. We can just return a result that includes $idx and all 1157 # subsequent ivecs. Since this will still be below goal, there's 1158 # nothing better to be done. 1159 1160 if ($loads->[$idx] <= $goal) { 1161 syslog('debug', 1162 "$idx: including all remaining intrs %s with load %d", 1163 ivecs_to_string(@$ivecs[$idx .. $#{$ivecs}]), 1164 $loads->[$idx]); 1165 return ($loads->[$idx], @$ivecs[$idx .. $#{$ivecs}]); 1166 } 1167 1168 # Evaluate the "with" option, i.e. the best matching goal which 1169 # includes $ivecs->[$idx]. If idx's load is more than our goal load, 1170 # stop here. Once we're above the goal, there is no need to consider 1171 # further interrupts since they'll only take us further from the goal. 1172 1173 if ($goal <= $load) { 1174 $with = $load; # stop here 1175 } else { 1176 ($with, @goals_with) = 1177 do_find_goal($ivecs, $loads, $goal - $load, $idx + 1); 1178 $with += $load; 1179 } 1180 syslog('debug', "$idx: with-load $with intrs %s", 1181 ivecs_to_string($ivecs->[$idx], @goals_with)); 1182 1183 # Evaluate the "without" option, i.e. the best matching goal which 1184 # excludes $ivecs->[$idx]. 1185 1186 ($without, @goals_without) = 1187 &do_find_goal($ivecs, $loads, $goal, $idx + 1); 1188 syslog('debug', "$idx: without-load $without intrs %s", 1189 ivecs_to_string(@goals_without)); 1190 1191 # We now have our "with" and "without" options, and we choose which 1192 # best fits the goal. If one is greater than goal and the other is 1193 # below goal, we choose the one that is greater. If they are both 1194 # below goal, then we choose the one that is greater. If they are 1195 # both above goal, then we choose the smaller. 1196 1197 my $which; # 0 == with, 1 == without 1198 if ($with >= $goal && $without < $goal) { 1199 $which = 0; 1200 } elsif ($with < $goal && $without >= $goal) { 1201 $which = 1; 1202 } elsif ($with >= $goal && $without >= $goal) { 1203 $which = ($without < $with); 1204 } else { 1205 $which = ($without > $with); 1206 } 1207 1208 # Return the load of our best case scenario, followed by all the ivecs 1209 # which compose that goal. 1210 1211 if ($which == 1) { # without 1212 syslog('debug', "$idx: going without"); 1213 return ($without, @goals_without); 1214 } else { 1215 syslog('debug', "$idx: going with"); 1216 return ($with, $ivecs->[$idx], @goals_with); 1217 } 1218 # Not reached 1219} 1220 1221 1222 1223 1224syslog('debug', "intrd is starting".($debug ? " (debug)" : "")); 1225 1226my @deltas = (); 1227my $deltas_tottime = 0; # sum of maxsnap-minsnap across @deltas 1228my $avggoodness; 1229my $baseline_goodness = 0; 1230my $compdelta; 1231 1232my $do_reconfig; 1233 1234# temp variables 1235my $goodness; 1236my $deltatime; 1237my $olddelta; 1238my $olddeltatime; 1239my $delta; 1240my $newstat; 1241my $below_statslen; 1242my $newtime; 1243my $ret; 1244 1245 1246my $gotsig = 0; 1247$SIG{INT} = sub { $gotsig = 1; }; # don't die in the middle of retargeting 1248$SIG{HUP} = $SIG{INT}; 1249$SIG{TERM} = $SIG{INT}; 1250 1251my $ks; 1252if ($using_scengen == 0) { 1253 $ks = Sun::Solaris::Kstat->new(); 1254} else { 1255 $ks = myks_update(); # supplied by the simulator 1256} 1257 1258# If no pci_intrs kstats were found, we need to exit, but we can't because 1259# SMF will restart us and/or report an error to the administrator. But 1260# there's nothing an administrator can do. So print out a message for SMF 1261# logs and silently pause forever. 1262 1263if (!exists($ks->{pci_intrs})) { 1264 print STDERR "$cmdname: no interrupts were found; ". 1265 "your PCI bus may not yet be supported\n"; 1266 pause() while $gotsig == 0; 1267 exit 0; 1268} 1269 1270# See if this is a system with a pcplusmp APIC. 1271# Such systems will get special handling. 1272# Assume that if one bus has a pcplusmp APIC that they all do. 1273 1274# Get a list of pci_intrs kstats. 1275my @elem = values(%{$ks->{pci_intrs}}); 1276my $elem0 = $elem[0]; 1277my $elemval = (values(%$elem0))[0]; 1278 1279# Use its buspath to query the system. It is assumed that either all or none 1280# of the busses on a system are hosted by the pcplusmp APIC. 1281my $pcplusmp_sys = is_pcplusmp($elemval->{buspath}); 1282 1283my $stat = getstat($ks, $pcplusmp_sys); 1284 1285for (;;) { 1286 sub clear_deltas { 1287 @deltas = (); 1288 $deltas_tottime = 0; 1289 $stat = 0; # prevent next gen_delta() from setting {missing} 1290 } 1291 1292 # 1. Sleep, update the kstats, and save the new stats in $newstat. 1293 1294 exit 0 if $gotsig; # if we got ^C / SIGTERM, exit 1295 if ($using_scengen == 0) { 1296 sleep($sleeptime); 1297 exit 0 if $gotsig; # if we got ^C / SIGTERM, exit 1298 $ks->update(); 1299 } else { 1300 $ks = myks_update(); 1301 } 1302 $newstat = getstat($ks, $pcplusmp_sys); 1303 1304 # $stat or $newstat could be zero if they're uninitialized, or if 1305 # getstat() failed. If $stat is zero, move $newstat to $stat, sleep 1306 # and try again. If $newstat is zero, then we also sleep and try 1307 # again, hoping the problem will clear up. 1308 1309 next if (!ref $newstat); 1310 if (!ref $stat) { 1311 $stat = $newstat; 1312 next; 1313 } 1314 1315 # 2. Compare $newstat with the prior set of values, result in %$delta. 1316 1317 $delta = generate_delta($stat, $newstat); 1318 dumpdelta($delta) if $debug; # Dump most recent stats to stdout. 1319 $stat = $newstat; # The new stats now become the old stats. 1320 1321 1322 # 3. If $delta->{missing}, then there has been a reconfiguration of 1323 # either cpus or interrupts (probably both). We need to toss out our 1324 # old set of statistics and start from scratch. 1325 # 1326 # Also, if the delta covers a very long range of time, then we've 1327 # been experiencing a system overload that has resulted in intrd 1328 # not being allowed to run effectively for a while now. As above, 1329 # toss our old statistics and start from scratch. 1330 1331 $deltatime = $delta->{maxsnap} - $delta->{minsnap}; 1332 if ($delta->{missing} > 0 || $deltatime > $statslen) { 1333 clear_deltas(); 1334 syslog('debug', "evaluating interrupt assignments"); 1335 next; 1336 } 1337 1338 1339 # 4. Incorporate new delta into the list of deltas, and associated 1340 # statistics. If we've just now received $statslen deltas, then it's 1341 # time to evaluate a reconfiguration. 1342 1343 $below_statslen = ($deltas_tottime < $statslen); 1344 $deltas_tottime += $deltatime; 1345 $do_reconfig = ($below_statslen && $deltas_tottime >= $statslen); 1346 push(@deltas, $delta); 1347 1348 # 5. Remove old deltas if total time is more than $statslen. We use 1349 # @deltas as a moving average of the last $statslen seconds. Shift 1350 # off the olders deltas, but only if that doesn't cause us to fall 1351 # below $statslen seconds. 1352 1353 while (@deltas > 1) { 1354 $olddelta = $deltas[0]; 1355 $olddeltatime = $olddelta->{maxsnap} - $olddelta->{minsnap}; 1356 $newtime = $deltas_tottime - $olddeltatime; 1357 last if ($newtime < $statslen); 1358 1359 shift(@deltas); 1360 $deltas_tottime = $newtime; 1361 } 1362 1363 # 6. The brains of the operation are here. First, check if we're 1364 # imbalanced, and if so set $do_reconfig. If $do_reconfig is set, 1365 # either because of imbalance or above in step 4, we evaluate a 1366 # new configuration. 1367 # 1368 # First, take @deltas and generate a single "compressed" delta 1369 # which summarizes them all. Pass that to do_reconfig and see 1370 # what it does with it: 1371 # 1372 # $ret == -1 : failure 1373 # $ret == 0 : current config is optimal (or close enough) 1374 # $ret == 1 : reconfiguration has occurred 1375 # 1376 # If $ret is -1 or 1, dump all our deltas and start from scratch. 1377 # Step 4 above will set do_reconfig soon thereafter. 1378 # 1379 # If $ret is 0, then nothing has happened because we're already 1380 # good enough. Set baseline_goodness to current goodness. 1381 1382 $compdelta = compress_deltas(\@deltas); 1383 if (VERIFY(ref($compdelta) eq "HASH", "couldn't compress deltas")) { 1384 clear_deltas(); 1385 next; 1386 } 1387 $compdelta->{goodness} = goodness($compdelta); 1388 dumpdelta($compdelta) if $debug; 1389 1390 $goodness = $compdelta->{goodness}; 1391 syslog('debug', "GOODNESS: %5.2f%%", $goodness * 100); 1392 1393 if ($deltas_tottime >= $statslen && 1394 imbalanced($goodness, $baseline_goodness)) { 1395 $do_reconfig = 1; 1396 } 1397 1398 if ($do_reconfig) { 1399 $ret = do_reconfig($compdelta); 1400 1401 if ($ret != 0) { 1402 clear_deltas(); 1403 syslog('debug', "do_reconfig FAILED!") if $ret == -1; 1404 } else { 1405 syslog('debug', "setting new baseline of $goodness"); 1406 $baseline_goodness = $goodness; 1407 } 1408 } 1409 syslog('debug', "---------------------------------------"); 1410} 1411