#!/usr/perl5/bin/perl
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License, Version 1.0 only
# (the "License"). You may not use this file except in compliance
# with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
#ident "%Z%%M% %I% %E% SMI"
#

require 5.6.1;
use strict;
use warnings;
use POSIX;
use File::Basename("basename");

my $cmdname = basename($0);	# program name; passed to openlog() as the ident

my $using_scengen = 0;	# 1 if using scenario simulator
my $debug = 0;		# set to 1 by the -D test option

# Bounds, in seconds, on the delay between kstat samples.  getstat()
# moves $sleeptime one second at a time between $min_sleeptime and
# $max_sleeptime, and sets it to $onecpu_sleeptime whenever it finds
# fewer than two on-line CPUs.

my $min_sleeptime = 1;
my $max_sleeptime = 15;
my $onecpu_sleeptime = (60 * 15);	# used if only 1 CPU on system
my $sleeptime = $min_sleeptime;		# time to sleep between kstat updates

# For timerange_foo variables, see comments at tail of &getstat()

my $timerange_toohi = .01;		# above this, getstat() reports failure
my $timerange_hithresh = .0003;		# above this, getstat() raises $sleeptime
my $timerange_lothresh = $timerange_hithresh / 2;	# below: lowers it
my $unsafe_timerange = .02;		# NOTE(review): no use is visible in the
					# part of the file reviewed here

my $statslen = 60;	# time period (in secs) to keep in @deltas


# Parse arguments. intrd does not accept any public arguments; the two
# arguments below are meant for testing purposes. -D generates a significant
# amount of syslog output. -S <filename> loads the filename as a perl
# script.
# That file is expected to implement a kstat "simulator" which
# can be used to feed information to intrd and verify intrd's responses.

# Test for definedness rather than truth, so that an argument which happens
# to be a false value ("0" or "") cannot end option parsing early.

while (defined(my $arg = shift @ARGV)) {
	if ($arg eq "-S" && @ARGV) {
		$using_scengen = 1;

		# Load the simulator.  "do FILE" does not raise an error
		# when the file fails to compile or dies while running; it
		# returns undef and leaves the reason in $@.  Carrying on
		# without a working simulator would only fail later, and
		# more obscurely, so stop here.  $@ is cleared first so
		# that a stale value cannot be mistaken for a failure.

		$@ = "";
		do $ARGV[0];
		die "$cmdname: cannot load simulator $ARGV[0]: $@" if $@;
		shift @ARGV;
	} elsif ($arg eq "-D") {
		$debug = 1;
	}
}

# When running for real (no simulator), pull in the kstat, interrupt and
# syslog modules at run time and open the log.  Only messages up to LOG_INFO
# are logged unless -D was given, in which case LOG_DEBUG is let through too.
# NOTE(review): in simulator mode none of this is loaded, so syslog() and
# intrmove() are presumably supplied by the simulator script -- confirm.

if ($using_scengen == 0) {
	require Sun::Solaris::Kstat;
	require Sun::Solaris::Intrs;
	Sun::Solaris::Intrs->import(qw(intrmove));
	require Sys::Syslog;
	Sys::Syslog->import();
	openlog($cmdname, 'pid', 'daemon');
	setlogmask(Sys::Syslog::LOG_UPTO($debug > 0 ? &Sys::Syslog::LOG_DEBUG :
	    &Sys::Syslog::LOG_INFO));
}


my $asserted = 0;		# count of failed assertions so far
my $assert_level = 'debug';	# syslog level for assertion failures

#
# VERIFY($condition, $msg, @args) is intrd's soft assertion.
#
# If $condition is numerically 0 the assertion has failed: the message is
# logged at $assert_level, $asserted is incremented, and a true value is
# returned.  Otherwise nothing is logged and a false value is returned.
# This lets callers write "if (VERIFY(...)) { recover }".
#
# $msg is handed to syslog() as a format string with @args as its
# arguments, so a literal percent sign in $msg must be written as "%%".
#
sub VERIFY($@)
{
	my $bad = (shift() == 0);	# $_[0] == 0 means assert failed
	if ($bad) {
		my $msg = shift();
		syslog($assert_level, "VERIFY: $msg", @_);
		$asserted++;
	}
	return ($bad);
}




# Forward declarations, so that the prototypes are known at every call site
# regardless of the order in which the subs are defined below.

sub getstat($);
sub generate_delta($$);
sub compress_deltas($);
sub dumpdelta($);

sub goodness($);
sub imbalanced($$);
sub do_reconfig($);

sub goodness_cpu($$);		# private function
sub move_intr($$$$);		# private function
sub move_intr_check($$$);	# private function
sub ivecs_to_string(@);		# private function
sub do_find_goal($$$$);		# private function
sub find_goal($$);		# private function
sub do_reconfig_cpu2cpu($$$$);	# private function
sub do_reconfig_cpu($$$);	# private function


#
# What follow are the basic data structures routines of intrd.
#
# getstat() is responsible for reading the kstats and generating a "stat" hash.
#
# generate_delta() is responsible for taking two "stat" hashes and creating
# a new "delta" hash that represents what has changed over time.
#
# compress_deltas() is responsible for taking a list of deltas and generating
# a single delta hash that encompasses all the time periods described by the
# deltas.
#
# getstat() is handed a reference to a kstat and boils it down to a "stat"
# hash, returned by reference, holding only the kstat fields intrd needs.
# If it returns the scalar 0 instead, no usable stat could be gathered, and
# the caller should react accordingly.
#
# getstat() is also responsible for maintaining a reasonable $sleeptime.
#
# {"snaptime"}		kstat's snaptime
# {<cpuid>}		one hash reference per online cpu
#  ->{"tot"}		== cpu:<cpuid>:sys:cpu_nsec_{user + kernel + idle}
#  ->{"crtime"}		== cpu:<cpuid>:sys:crtime
#  ->{"ivecs"}
#     ->{<cookie#>}	iterates over pci_intrs::config:cookie
#        ->{"time"}	== pci_intrs:<ivec#>:config:time (in nsec)
#        ->{"pil"}	== pci_intrs:<ivec#>:config:pil
#        ->{"crtime"}	== pci_intrs:<ivec#>:config:crtime
#        ->{"ino"}	== pci_intrs:<ivec#>:config:ino
#        ->{"buspath"}	== pci_intrs:<ivec#>:config:buspath
#        ->{"name"}	== pci_intrs:<ivec#>:config:name
#        ->{"ihs"}	== pci_intrs:<ivec#>:config:ihs
#

sub getstat($)
{
	my ($ks) = @_;

	my %stat;
	my $online = 0;		# number of on-line cpus seen

	# The kernel does not generate kstats atomically; every kstat
	# hierarchy carries its own snaptime.  On a thrashing system we
	# may be too slow to obtain timing information that is coherent
	# across all of them.  $minsnap and $maxsnap track the earliest
	# and latest snaptime of every kstat we touch, so their difference
	# roughly measures how long getstat() took.  Should that approach
	# the interval between snapshots, the results may be useless.

	my $minsnap = -1;	# snaptime is always a positive number
	my $maxsnap = $minsnap;

	# First pass: the cpus in cpu:<cpuid>::.  A cpu whose
	# cpu_info:<cpuid>:cpu_info<cpuid>:state is not "on-line" is not
	# accepting interrupts and is of no interest to us.
	#
	# Record cpu:<cpuid>:sys:snaptime, and check $minsnap/$maxsnap.

	while (my ($cpuid, $cpukstat) = each %{$ks->{cpu}}) {
		exists($ks->{cpu_info}{$cpuid}{"cpu_info$cpuid"}{state})
		    or next;
		$ks->{cpu_info}{$cpuid}{"cpu_info$cpuid"}{state} =~
		    /^on-line\0/ or next;

		my $sys = $cpukstat->{sys};

		$stat{$cpuid} = {
			tot	=> ($sys->{cpu_nsec_idle} +
				    $sys->{cpu_nsec_user} +
				    $sys->{cpu_nsec_kernel}),
			crtime	=> $sys->{crtime},
			ivecs	=> {},
		};

		my $snap = $sys->{snaptime};
		$minsnap = $snap if ($minsnap == -1 || $snap < $minsnap);
		$maxsnap = $snap if ($snap > $maxsnap);
		$online++;
	}

	# With at most one usable cpu there is nothing to balance; idle.

	if ($online <= 1) {
		$sleeptime = $onecpu_sleeptime;
		return (0);
	}

	# Second pass: the ivecs.  Any ivec mapped to a cpu that is not
	# on-line (i.e. not recorded above) is ignored.
	#
	# Record pci_intrs:{inum}:config:time, snaptime, crtime, pil,
	# ino, name, and buspath.  Check $minsnap/$maxsnap.

	foreach my $inst (values(%{$ks->{pci_intrs}})) {
		my $cfg = $inst->{config};
		my $cpuid = $cfg->{cpu};

		next if !exists($stat{$cpuid});
		next if ($cfg->{type} =~ /^disabled\0/);

		my $snap = $cfg->{snaptime};
		if ($snap < $minsnap) {
			$minsnap = $snap;
		} elsif ($snap > $maxsnap) {
			$maxsnap = $snap;
		}

		my $ivecs = $stat{$cpuid}{ivecs};
		my $cookie = "$cfg->{buspath} $cfg->{ino}";

		if (!exists($ivecs->{$cookie})) {
			# First handler seen for this buspath/ino pair.
			$ivecs->{$cookie} = {
				time	=> $cfg->{time},
				crtime	=> $cfg->{crtime},
				pil	=> $cfg->{pil},
				ino	=> $cfg->{ino},
				buspath	=> $cfg->{buspath},
				name	=> $cfg->{name},
				ihs	=> 1,
			};
			next;
		}

		# Another handler sharing an already-recorded cookie:
		# fold its time and name into the existing entry.

		my $shared = $ivecs->{$cookie};
		$shared->{time} += $cfg->{time};
		$shared->{name} .= "/$cfg->{name}";

		# If this sharing handler is new since an earlier getstat,
		# generate_delta must be able to notice.  Keep crtime at
		# the most recent crtime of all of the components.

		$shared->{crtime} = $cfg->{crtime}
		    if ($cfg->{crtime} > $shared->{crtime});
		$shared->{ihs}++;
	}

	# We define the timerange as the amount of time spent gathering the
	# various kstats, divided by our sleeptime.  If gathering is slow,
	# then a delta made from these kstats and a prior set will cover a
	# substantially different amount of time depending upon which
	# interrupt or CPU is being examined.
	#
	# Checking the timerange here guarantees that any delta created
	# from these kstats contains self-consistent data, in that all CPUs
	# and interrupts cover a similar span of time.
	#
	# We try to keep the timerange between $timerange_lothresh and
	# $timerange_hithresh.  Too large, and besides the accuracy concerns
	# above, intrd is using a lot of CPU time.  Too small, and our sleep
	# time is large, so we could fail to react quickly enough to a
	# sudden change.
	#
	# Finally, $timerange_toohi is the upper bound; any timerange above
	# it is thrown out as garbage.  A stat safely within this bound is
	# treated as representing an instant in time rather than the range
	# it actually spans, and minsnap is arbitrarily chosen as that
	# instant.

	$stat{snaptime} = $minsnap;
	my $timerange = ($maxsnap - $minsnap) / $sleeptime;

	if ($sleeptime == $onecpu_sleeptime) {
		$sleeptime = $min_sleeptime;	# time to come out of idling
	} elsif ($timerange > $timerange_hithresh &&
	    $sleeptime < $max_sleeptime) {
		$sleeptime++;
	} elsif ($timerange < $timerange_lothresh &&
	    $sleeptime > $min_sleeptime) {
		$sleeptime--;
	}

	if ($timerange > $timerange_toohi) {
		return (0);		# i.e. failure
	}
	return (\%stat);
}

#
# dumpdelta takes a reference to our "delta" structure:
# {"missing"}		"1" if the delta's component stats had inconsistencies
# {"minsnap"}		time of the first kstat snaptime used in this delta
# {"maxsnap"}		time of the last kstat snaptime used in this delta
# {"goodness"}		cost function applied to this delta
# {"avgintrload"}	avg of interrupt load across cpus, as a percentage
# {"avgintrnsec"}	avg number of nsec spent in interrupts, per cpu
# {<cpuid>}		iterates over on-line cpus
#  ->{"intrs"}		cpu's movable intr time (sum of "time" for each ivec)
#  ->{"tot"}		CPU load from all sources
#  ->{"bigintr"}	largest value of {ivecs}{<ivec#>}{time} from below
#  ->{"intrload"}	intrs / tot
#  ->{"ivecs"}
#     ->{<ivec#>}	iterates over ivecs for this cpu
#        ->{"time"}	time used by this interrupt (in nsec)
#        ->{"pil"}	pil level of this interrupt
#        ->{"ino"}	interrupt number
#        ->{"buspath"}	filename of the directory of the device's bus
#        ->{"name"}	device name
#        ->{"ihs"}	number of different handlers sharing this ino
#
# It prints out the delta structure in a nice, human readable display.
#

# dumpdelta($delta) writes the contents of a delta to syslog at debug level:
# first the delta-wide figures, then one section per cpu, and within each
# cpu one line per interrupt vector.  It returns nothing useful.

sub dumpdelta($)
{
	my ($delta) = @_;

	# delta-wide figures

	syslog('debug', "dumpdelta:");
	if ($delta->{missing} > 0) {
		syslog('debug', " RECONFIGURATION IN DELTA");
	}
	syslog('debug', " avgintrload: %5.2f%% avgintrnsec: %d",
	    $delta->{avgintrload} * 100, $delta->{avgintrnsec});
	if (exists($delta->{goodness})) {
		syslog('debug', " goodness: %5.2f%%",
		    $delta->{goodness} * 100);
	}

	# per-cpu sections; only the cpu entries of a delta are references

	while (my ($cpuid, $cpu) = each %$delta) {
		ref($cpu) or next;

		my $tot = $cpu->{tot};
		my $loadpct = $cpu->{intrload} * 100;
		my $bigpct = $cpu->{bigintr} * 100 / $tot;

		syslog('debug', " cpu %3d intr %7.3f%% (bigintr %7.3f%%)",
		    $cpuid, $loadpct, $bigpct);
		syslog('debug', " intrs %d, bigintr %d",
		    $cpu->{intrs}, $cpu->{bigintr});

		# per-ivec lines; a shared ino shows its handler count

		while (my ($inum, $ivec) = each %{$cpu->{ivecs}}) {
			my $label = $ivec->{name};
			if ($ivec->{ihs} > 1) {
				$label = "$ivec->{name}($ivec->{ihs})";
			}
			syslog('debug', " %15s:\"%s\": %7.3f%% %d",
			    $label, $inum,
			    $ivec->{time} * 100 / $tot, $ivec->{time});
		}
	}
}

#
# generate_delta($stat, $newstat) takes two stat references, returned from
# getstat(), and creates a %delta. %delta (not surprisingly) contains the
# same basic info as stat and newstat, but with the timestamps as deltas
# instead of absolute times. We return a reference to the delta.
#
# generate_delta() always returns a hash reference.  Whenever the two stats
# cannot be compared reliably (time went backwards, a cpu or an interrupt
# appeared, vanished or was re-created), {missing} is set to 1 and the delta
# is returned as far as it had been built; such a delta must not be used for
# balancing decisions.  {avgintrload} and {avgintrnsec} are always present,
# and are 0 unless the delta was completed.
#

sub generate_delta($$)
{
	my ($stat, $newstat) = @_;

	my %delta = ();
	my $intrload = 0;	# sum of per-cpu interrupt loads
	my $intrnsec = 0;	# sum of per-cpu interrupt nsec
	my $cpus = 0;		# number of cpus summed so far

	# Give the averages a defined value right away, so that a delta
	# returned early with {missing} set can still be handed to
	# dumpdelta() without it formatting undefined values.

	$delta{avgintrload} = 0;
	$delta{avgintrnsec} = 0;

	# Take the worstcase timerange
	$delta{minsnap} = $stat->{snaptime};
	$delta{maxsnap} = $newstat->{snaptime};
	if (VERIFY($delta{maxsnap} > $delta{minsnap},
	    "generate_delta: stats aren't ascending")) {
		$delta{missing} = 1;
		return (\%delta);
	}

	# if there are a different number of cpus in the stats, set missing

	$delta{missing} = (keys(%$stat) != keys(%$newstat));
	if (VERIFY($delta{missing} == 0,
	    "generate_delta: number of CPUs changed")) {
		return (\%delta);
	}

	# scan through every cpu in %newstat and compare against %stat

	while (my ($cpu, $newcpst) = each %$newstat) {
		next if !ref($newcpst);		# skip non-cpuid fields

		# If %stat is missing a cpu from %newstat, then it was just
		# onlined.  Mark missing.  (Identifiers are passed as format
		# arguments rather than interpolated, so that nothing read
		# from a kstat is ever interpreted as a format string.)

		if (VERIFY(exists $stat->{$cpu} &&
		    $stat->{$cpu}{crtime} == $newcpst->{crtime},
		    "generate_delta: cpu %s changed", $cpu)) {
			$delta{missing} = 1;
			return (\%delta);
		}
		my $cpst = $stat->{$cpu};
		$delta{$cpu}{tot} = $newcpst->{tot} - $cpst->{tot};
		if (VERIFY($delta{$cpu}{tot} >= 0,
		    "generate_delta: deltas are not ascending?")) {
			$delta{missing} = 1;
			delete($delta{$cpu});
			return (\%delta);
		}
		# Avoid remote chance of division by zero
		$delta{$cpu}{tot} = 1 if $delta{$cpu}{tot} == 0;
		$delta{$cpu}{intrs} = 0;
		$delta{$cpu}{bigintr} = 0;

		my %ivecs = ();
		$delta{$cpu}{ivecs} = \%ivecs;

		# if the number of ivecs differs, set missing

		if (VERIFY(keys(%{$cpst->{ivecs}}) ==
		    keys(%{$newcpst->{ivecs}}),
		    "generate_delta: cpu %s has more/less interrupts",
		    $cpu)) {
			$delta{missing} = 1;
			return (\%delta);
		}

		while (my ($inum, $newivec) = each %{$newcpst->{ivecs}}) {
			# If this ivec doesn't exist in $stat, or if $stat
			# shows a different crtime, set missing.

			if (VERIFY(exists $cpst->{ivecs}{$inum} &&
			    $cpst->{ivecs}{$inum}{crtime} ==
			    $newivec->{crtime},
			    "generate_delta: cpu %s inum %s has changed",
			    $cpu, $inum)) {
				$delta{missing} = 1;
				return (\%delta);
			}
			my $ivec = $cpst->{ivecs}{$inum};

			# Create $delta{$cpu}{ivecs}{$inum}.

			my %dltivec = ();
			$delta{$cpu}{ivecs}{$inum} = \%dltivec;

			# calculate time used by this interrupt

			my $time = $newivec->{time} - $ivec->{time};
			if (VERIFY($time >= 0,
			    "generate_delta: ivec went backwards?")) {
				$delta{missing} = 1;
				delete($delta{$cpu}{ivecs}{$inum});
				return (\%delta);
			}
			$delta{$cpu}{intrs} += $time;
			$dltivec{time} = $time;
			if ($time > $delta{$cpu}{bigintr}) {
				$delta{$cpu}{bigintr} = $time;
			}

			# Transfer over basic info about the kstat.  We
			# don't have to worry about discrepancies between
			# ivec and newivec because we verified that both
			# have the same crtime.

			$dltivec{pil} = $newivec->{pil};
			$dltivec{ino} = $newivec->{ino};
			$dltivec{buspath} = $newivec->{buspath};
			$dltivec{name} = $newivec->{name};
			$dltivec{ihs} = $newivec->{ihs};
		}
		if ($delta{$cpu}{tot} < $delta{$cpu}{intrs}) {
			# Ewww!  Hopefully just a rounding error.
			# Make something up.
			$delta{$cpu}{tot} = $delta{$cpu}{intrs};
		}
		$delta{$cpu}{intrload} =
		    $delta{$cpu}{intrs} / $delta{$cpu}{tot};
		$intrload += $delta{$cpu}{intrload};
		$intrnsec += $delta{$cpu}{intrs};
		$cpus++;
	}
	if ($cpus > 0) {
		$delta{avgintrload} = $intrload / $cpus;
		$delta{avgintrnsec} = $intrnsec / $cpus;
	}
	return (\%delta);
}


# compress_deltas() takes a reference to a list of deltas, and returns a
# reference to a single new delta which represents the combined information
# from all the deltas.  The deltas provided are assumed to be sequential in
# time.  The resulting compressed delta looks just like any other delta.
# This new delta is also more accurate since its statistics are averaged
# over a longer period than any of the original deltas.
#
# It returns the scalar 0 if the list is empty or if any of the deltas has
# {missing} set.

sub compress_deltas ($)
{
	my ($deltas) = @_;

	my %newdelta = ();
	my $intrs = 0;		# interrupt nsec summed over all cpus and deltas
	my $tot = 0;		# total nsec summed over all cpus and deltas
	my $cpus = 0;

	if (VERIFY($#$deltas != -1,
	    "compress_deltas: list of delta is empty?")) {
		return (0);
	}
	$newdelta{minsnap} = $deltas->[0]{minsnap};
	$newdelta{maxsnap} = $deltas->[$#$deltas]{maxsnap};
	$newdelta{missing} = 0;

	foreach my $delta (@$deltas) {
		if (VERIFY($delta->{missing} == 0,
		    "compressing bad deltas?")) {
			return (0);
		}
		while (my ($cpuid, $cpu) = each %$delta) {
			next if !ref($cpu);

			$intrs += $cpu->{intrs};
			$tot += $cpu->{tot};
			$newdelta{$cpuid}{intrs} += $cpu->{intrs};
			$newdelta{$cpuid}{tot} += $cpu->{tot};
			if (!exists $newdelta{$cpuid}{ivecs}) {
				my %ivecs = ();
				$newdelta{$cpuid}{ivecs} = \%ivecs;
			}
			while (my ($inum, $ivec) = each %{$cpu->{ivecs}}) {
				my $newivecs = $newdelta{$cpuid}{ivecs};
				$newivecs->{$inum}{time} += $ivec->{time};
				$newivecs->{$inum}{pil} = $ivec->{pil};
				$newivecs->{$inum}{ino} = $ivec->{ino};
				$newivecs->{$inum}{buspath} = $ivec->{buspath};
				$newivecs->{$inum}{name} = $ivec->{name};
				$newivecs->{$inum}{ihs} = $ivec->{ihs};
			}
		}
	}
	foreach my $cpu (values(%newdelta)) {
		next if !ref($cpu);	# ignore non-cpu fields
		$cpus++;

		my $bigintr = 0;
		foreach my $ivec (values(%{$cpu->{ivecs}})) {
			if ($ivec->{time} > $bigintr) {
				$bigintr = $ivec->{time};
			}
		}
		$cpu->{bigintr} = $bigintr;

		# Clamp the total BEFORE dividing by it; the guard is of
		# no use once the division has already been attempted.

		$cpu->{tot} = 1 if $cpu->{tot} <= 0;
		$cpu->{intrload} = $cpu->{intrs} / $cpu->{tot};
	}
	if ($cpus == 0) {
		$newdelta{avgintrnsec} = 0;
		$newdelta{avgintrload} = 0;
	} else {
		$newdelta{avgintrnsec} = $intrs / $cpus;
		$newdelta{avgintrload} = ($tot > 0) ? $intrs / $tot : 0;
	}
	return (\%newdelta);
}





# What follow are the core functions responsible for examining the deltas
# generated above and deciding what to do about them.
#
# goodness() and its helper goodness_cpu() return a heuristic which describes
# how good (or bad) the current interrupt balance is. The value returned will
# be between 0 and 1, with 0 representing maximum goodness, and 1 representing
# maximum badness.
#
# imbalanced() compares a current and historical value of goodness, and
# determines if there has been enough change to warrant evaluating a
# reconfiguration of the interrupts
#
# do_reconfig(), and its helpers, do_reconfig_cpu(), do_reconfig_cpu2cpu(),
# find_goal(), do_find_goal(), and move_intr(), are responsible for examining
# a delta and determining the best possible assignment of interrupts to CPUs.
#
# It is important that do_reconfig() be in alignment with goodness(). If
# do_reconfig were to generate a new interrupt distribution that worsened
# goodness, we could get into a pathological loop with intrd fighting itself,
# constantly deciding that things are imbalanced, and then changing things
# only to make them worse.
# any goodness over $goodness_unsafe_load is considered really bad
# goodness must drop by at least $goodness_mindelta for a reconfig

my $goodness_unsafe_load = .9;
my $goodness_mindelta = .1;

# goodness(%delta) examines a delta and returns its "goodness", a number
# between 0 (best) and 1 (major bad). It is found by evaluating the goodness
# of each individual cpu and returning the worst case. This helps on systems
# with many CPUs, where a single pathological CPU might otherwise be ignored
# because the average was OK.
#
# The goodness of an individual CPU starts from its load due to interrupts.
# If that load is above a certain high threshold and more than one interrupt
# is assigned to the CPU, goodness is worst-case. If the load is below the
# average interrupt load of all CPUs, it is best-case, since what's to
# complain about?
#
# Otherwise goodness is how far the load is above the average, with one
# caveat: it is never more than the CPU's interrupt load ignoring its largest
# single interrupt source. A CPU with one high-load interrupt, and no other
# interrupts, cannot be improved upon, and is thus perfectly balanced even if
# that interrupt's load is 100%.

sub goodness($)
{
	my ($delta) = @_;

	# A delta with inconsistencies is as bad as it gets.
	if ($delta->{missing} > 0) {
		return (1);
	}

	my $worst = 0;

	foreach my $cpu (values(%$delta)) {
		ref($cpu) or next;	# skip non-cpuid fields

		my $cpugood = goodness_cpu($cpu, $delta->{avgintrload});
		if (VERIFY($cpugood >= 0 && $cpugood <= 1,
		    "goodness: cpu goodness out of range?")) {
			dumpdelta($delta);
			return (1);
		}

		# worst case, no need to continue
		return (1) if ($cpugood == 1);

		$worst = $cpugood if ($cpugood > $worst);
	}
	return ($worst);
}

sub goodness_cpu($$)	# private function
{
	my ($cpu, $avgintrload) = @_;

	my ($intrs, $tot) = ($cpu->{intrs}, $cpu->{tot});
	my $load = $intrs / $tot;

	# low loads are perfectly good
	if ($load < $avgintrload) {
		return (0);
	}

	# The load due to interrupts once the single biggest interrupt is
	# excluded.  This is the most that could be gained on this CPU by
	# offloading interrupts.

	my $rest = ($intrs - $cpu->{bigintr}) / $tot;

	# A CPU saturated with interrupt handling which has more than one
	# source of interrupts is a major imbalance: the other interrupts
	# could be starved if of a lower pil.  Return 1, the worst possible
	# value, which effectively contaminates the entire delta.

	my $sources = keys(%{$cpu->{ivecs}});
	return (1) if ($load > $goodness_unsafe_load && $sources > 1);

	# Otherwise: the excess over the average, capped at $rest.

	my $excess = $load - $avgintrload;
	return ($excess > $rest ? $rest : $excess);
}


# imbalanced() is used by the main routine to determine if the goodness
# has shifted far enough from our last baseline to warrant a reassignment
# of interrupts.
# A very high goodness indicates that a CPU is way out of
# whack. If the goodness has varied too much since the baseline, then
# perhaps a reconfiguration is worth considering.
#
# Returns 1 if a reconfiguration should be evaluated, 0 if not.

sub imbalanced ($$)
{
	my ($goodness, $baseline) = @_;

	# Either we are pathological, or we have crept too far away
	# from the baseline (in either direction).

	if ($goodness > .50 ||
	    abs($goodness - $baseline) > $goodness_mindelta) {
		return (1);
	}
	return (0);
}

# do_reconfig(), do_reconfig_cpu(), and do_reconfig_cpu2cpu(), are the
# decision-making functions responsible for generating a new interrupt
# distribution. They are designed with the definition of goodness() in
# mind, i.e. they use the same definition of "good distribution" as does
# goodness().
#
# do_reconfig() is responsible for deciding whether a redistribution is
# actually warranted. If the goodness is already pretty good, it doesn't
# waste the CPU time to generate a new distribution. If it
# calculates a new distribution and finds that it is not sufficiently
# improved from the prior distribution, it will not do the redistribution,
# mainly to avoid the disruption to system performance caused by
# rejuggling interrupts.
#
# Its main loop works by going through a list of cpus sorted from
# highest to lowest interrupt load. It removes the highest-load cpus
# one at a time and hands them off to do_reconfig_cpu(). This function
# then re-sorts the remaining CPUs from lowest to highest interrupt load,
# and one at a time attempts to rejuggle interrupts between the original
# high-load CPU and the low-load CPU. Rejuggling on a high-load CPU is
# considered finished as soon as its interrupt load is within
# $goodness_mindelta of the average interrupt load. Such a CPU will have
# a goodness of below the $goodness_mindelta threshold.

#
# move_intr(\%delta, $inum, $oldcpu, $newcpu)
# used by reconfiguration code to move an interrupt between cpus within
# a delta. This manipulates data structures, and does not actually move
# the interrupt on the running system.
#
# Both cpus' {intrs}, {intrload}, {bigintr} and {ivecs} are brought up to
# date, and the ivec's {nowcpu} is set to the new cpu.
#
sub move_intr($$$$)	# private function
{
	my ($delta, $inum, $oldcpuid, $newcpuid) = @_;

	my $ivec = $delta->{$oldcpuid}{ivecs}{$inum};

	# Remove ivec from old cpu

	my $oldcpu = $delta->{$oldcpuid};
	$oldcpu->{intrs} -= $ivec->{time};
	$oldcpu->{intrload} = $oldcpu->{intrs} / $oldcpu->{tot};
	delete($oldcpu->{ivecs}{$inum});

	VERIFY($oldcpu->{intrs} >= 0, "move_intr: intr's time > total time?");
	VERIFY($ivec->{time} <= $oldcpu->{bigintr},
	    "move_intr: intr's time > bigintr?");

	# If the ivec just removed was the old cpu's biggest, find the
	# biggest among those which remain (0 if none remain).

	if ($ivec->{time} >= $oldcpu->{bigintr}) {
		my $bigtime = 0;

		foreach my $ivec (values(%{$oldcpu->{ivecs}})) {
			$bigtime = $ivec->{time} if $ivec->{time} > $bigtime;
		}
		$oldcpu->{bigintr} = $bigtime;
	}

	# Add ivec onto new cpu

	my $newcpu = $delta->{$newcpuid};

	$ivec->{nowcpu} = $newcpuid;
	$newcpu->{intrs} += $ivec->{time};
	$newcpu->{intrload} = $newcpu->{intrs} / $newcpu->{tot};
	$newcpu->{ivecs}{$inum} = $ivec;

	$newcpu->{bigintr} = $ivec->{time}
	    if $ivec->{time} > $newcpu->{bigintr};
}

#
# move_intr_check(\%delta, $oldcpuid, $newcpuid)
# sanity-checks the result of moving interrupts between two cpus: neither
# cpu may have been left with more interrupt time than total time. A
# violation is only reported through VERIFY(); nothing is changed.
#
sub move_intr_check($$$)	# private function
{
	my ($delta, $oldcpuid, $newcpuid) = @_;

	VERIFY($delta->{$oldcpuid}{tot} >= $delta->{$oldcpuid}{intrs},
	    "Moved interrupts left 100+%% load on src cpu");
	VERIFY($delta->{$newcpuid}{tot} >= $delta->{$newcpuid}{intrs},
	    "Moved interrupts left 100+%% load on tgt cpu");
}

#
# ivecs_to_string(@ivecs)
# returns the {inum} of every ivec given, each preceded by a space, as one
# string (the empty string for an empty list).
#
sub ivecs_to_string(@)	# private function
{
	my $str = "";
	foreach my $ivec (@_) {
		$str = "$str $ivec->{inum}";
	}
	return ($str);
}


#
# do_reconfig(\%delta)
# decides upon, and if worthwhile carries out, a new assignment of
# interrupts to cpus. $delta->{goodness} must already have been set by
# the caller. The delta is modified in place to describe the new
# assignment, whether or not that assignment is then applied.
#
# Returns 0 if nothing was moved because the gain would be too small,
# -1 if the delta has {missing} set or if any intrmove() call failed,
# and 1 if every interrupt which had to move was moved.
#
sub do_reconfig($)
{
	my ($delta) = @_;

	my $goodness = $delta->{goodness};

	# We can't improve goodness to better than 0. We should stop here
	# if, even if we achieve a goodness of 0, the improvement is still
	# too small to merit the action.

	if ($goodness - 0 < $goodness_mindelta) {
		syslog('debug', "goodness good enough, don't reconfig");
		return (0);
	}

	syslog('notice', "Optimizing interrupt assignments");

	if (VERIFY ($delta->{missing} == 0, "RECONFIG Aborted: should not ".
	    "have a delta with missing")) {
		return (-1);
	}

	# Make a list of all cpuids, and also add some extra information
	# to the ivec structures: where each ivec started out ({origcpu}),
	# where it is now ({nowcpu}), and its own key ({inum}).

	my @cpusortlist = ();

	while (my ($cpuid, $cpu) = each %$delta) {
		next if !ref($cpu);	# skip non-cpu entries

		push(@cpusortlist, $cpuid);
		while (my ($inum, $ivec) = each %{$cpu->{ivecs}}) {
			$ivec->{origcpu} = $cpuid;
			$ivec->{nowcpu} = $cpuid;
			$ivec->{inum} = $inum;
		}
	}

	# Sort the list of CPUs from highest to lowest interrupt load.
	# Remove the top CPU from that list and attempt to redistribute
	# its interrupts. If the CPU has a goodness below a threshold,
	# just ignore the CPU and move to the next one. If the CPU's
	# load falls below the average load plus that same threshold,
	# then there are no CPUs left worth reconfiguring, and we're done.

	while (@cpusortlist) {
		# Re-sort cpusortlist each time, since do_reconfig_cpu can
		# move interrupts around.

		@cpusortlist =
		    sort({$delta->{$b}{intrload} <=> $delta->{$a}{intrload}}
		    @cpusortlist);

		my $cpu = shift(@cpusortlist);
		if (($delta->{$cpu}{intrload} <= $goodness_unsafe_load) &&
		    ($delta->{$cpu}{intrload} <=
		    $delta->{avgintrload} + $goodness_mindelta)) {
			syslog('debug', "finished reconfig: cpu $cpu load ".
			    "$delta->{$cpu}{intrload} avgload ".
			    "$delta->{avgintrload}");
			last;
		}
		if (goodness_cpu($delta->{$cpu}, $delta->{avgintrload}) <
		    $goodness_mindelta) {
			next;
		}
		do_reconfig_cpu($delta, \@cpusortlist, $cpu);
	}

	# How good a job did we do? If the improvement was minimal, and
	# our goodness wasn't pathological (and thus needing any help it
	# can get), then don't bother moving the interrupts.

	my $newgoodness = goodness($delta);
	VERIFY($newgoodness <= $goodness,
	    "reconfig: result has worse goodness?");

	if (($goodness != 1 || $newgoodness == 1) &&
	    $goodness - $newgoodness < $goodness_mindelta) {
		syslog('debug', "goodness already near optimum, ".
		    "don't reconfig");
		return (0);
	}
	syslog('debug', "goodness %5.2f%% --> %5.2f%%", $goodness*100,
	    $newgoodness*100);

	# Time to move those interrupts! Only ivecs which ended up on a
	# cpu other than the one they started on need to be moved. The
	# general warning is logged once; each failure is logged at debug.
	# (Here non-cpu entries are skipped by key: cpuids are all digits.)

	my $ret = 1;
	my $warned = 0;
	while (my ($cpuid, $cpu) = each %$delta) {
		next if $cpuid =~ /\D/;
		while (my ($inum, $ivec) = each %{$cpu->{ivecs}}) {
			next if ($ivec->{origcpu} == $cpuid);

			if (!intrmove($ivec->{buspath}, $ivec->{ino},
			    $cpuid)) {
				syslog('warning', "Unable to move interrupts")
				    if $warned++ == 0;
				syslog('debug', "Unable to move buspath ".
				    "$ivec->{buspath} ino $ivec->{ino} to ".
				    "cpu $cpuid");
				$ret = -1;
			}
		}
	}

	# NOTE(review): this notice is logged even when $ret is -1, i.e.
	# when some of the moves failed -- confirm that this is intended.

	syslog('notice', "Interrupt assignments optimized");
	return ($ret);
}

#
# do_reconfig_cpu(\%delta, \@cpusortlist, $oldcpuid)
# tries to improve the load on $oldcpuid by exchanging interrupts with the
# cpus on @cpusortlist, least loaded first. The caller's list is not
# modified; the delta is.
#
sub do_reconfig_cpu($$$)	# private function
{
	my ($delta, $cpusortlist, $oldcpuid) = @_;

	# We have been asked to rejuggle interrupts between $oldcpuid and
	# other CPUs found on $cpusortlist so as to improve the load on
	# $oldcpuid. We reverse $cpusortlist to get our own copy of the
	# list, sorted from lowest to highest interrupt load. One at a
	# time, shift a CPU off of this list of CPUs, and attempt to
	# rejuggle interrupts between the two CPUs. Don't do this if the
	# other CPU has a higher load than oldcpuid. We're done rejuggling
	# once $oldcpuid's goodness falls below a threshold.

	syslog('debug', "reconfiguring $oldcpuid");

	my $cpu = $delta->{$oldcpuid};
	my $avgintrload = $delta->{avgintrload};

	my @cputargetlist = reverse(@$cpusortlist); # make a copy of the list
	while ($#cputargetlist != -1) {
		last if goodness_cpu($cpu, $avgintrload) < $goodness_mindelta;

		my $tgtcpuid = shift(@cputargetlist);
		my $tgt = $delta->{$tgtcpuid};
		my $load = $cpu->{intrload};
		my $tgtload = $tgt->{intrload};
		last if $tgtload > $load;
		do_reconfig_cpu2cpu($delta, $oldcpuid, $tgtcpuid, $load);
	}
}

#
# do_reconfig_cpu2cpu(\%delta, $srccpuid, $tgtcpuid, $srcload)
# redistributes, within the delta, the interrupts of the two given cpus
# between those two cpus. $srcload is the interrupt load of $srccpuid on
# entry, and is used only to check the result.
#
sub do_reconfig_cpu2cpu($$$$)	# private function
{
	my ($delta, $srccpuid, $tgtcpuid, $srcload) = @_;

	# We've been asked to consider interrupt juggling between srccpuid
	# (with a high interrupt load) and tgtcpuid (with a lower interrupt
	# load). First, make a single list with all of the ivecs from both
	# CPUs, and sort the list from highest to lowest load.

	syslog('debug', "exchanging intrs between $srccpuid and $tgtcpuid");

	# Gather together all the ivecs and sort by load

	my @ivecs = (values(%{$delta->{$srccpuid}{ivecs}}),
	    values(%{$delta->{$tgtcpuid}{ivecs}}));
	return if $#ivecs == -1;

	@ivecs = sort({$b->{time} <=> $a->{time}} @ivecs);

	# Our "goal" load for srccpuid is the average load across all CPUs.
	# find_goal() will determine the optimum selection of the
	# available interrupts which comes closest to this goal without
	# falling below the goal.

	my $goal = $delta->{avgintrnsec};

	# We know that the interrupt load on tgtcpuid is less than that on
	# srccpuid, but its load could still be above avgintrnsec. Don't
	# choose a goal which would bring srccpuid below the load on tgtcpuid.

	my $avgnsec =
	    ($delta->{$srccpuid}{intrs} + $delta->{$tgtcpuid}{intrs}) / 2;
	if ($goal < $avgnsec) {
		$goal = $avgnsec;
	}

	# If the largest of the interrupts is on srccpuid, leave it there.
	# This can help minimize the disruption caused by moving interrupts.
	#
	# NOTE(review): the test is on {origcpu}, where the ivec was when
	# do_reconfig() began, not on {nowcpu}, where it is within the
	# delta right now -- confirm that the two cannot differ here.

	if ($ivecs[0]->{origcpu} == $srccpuid) {
		syslog('debug', "Keeping $ivecs[0]->{inum} on $srccpuid");
		$goal -= $ivecs[0]->{time};
		shift(@ivecs);
	}

	syslog('debug', "GOAL: inums should total $goal");
	find_goal(\@ivecs, $goal);

	# find_goal() returned its results to us by setting $ivec->{goal} if
	# the ivec should be on srccpuid, or clearing it for tgtcpuid.
	# Call move_intr() to update our $delta with the new results.

	foreach my $ivec (@ivecs) {
		syslog('debug', "ivec $ivec->{inum} goal $ivec->{goal}");
		VERIFY($ivec->{nowcpu} == $srccpuid ||
		    $ivec->{nowcpu} == $tgtcpuid, "cpu2cpu found an ".
		    "interrupt not currently on src or tgt cpu");

		if ($ivec->{goal} && $ivec->{nowcpu} != $srccpuid) {
			move_intr($delta, $ivec->{inum}, $ivec->{nowcpu},
			    $srccpuid);
		} elsif ($ivec->{goal} == 0 && $ivec->{nowcpu} != $tgtcpuid) {
			move_intr($delta, $ivec->{inum}, $ivec->{nowcpu},
			    $tgtcpuid);
		}
	}
	move_intr_check($delta, $srccpuid, $tgtcpuid);	# asserts

	# The source cpu should be no more loaded than it was on entry,
	# yet still above the average.

	my $newload = $delta->{$srccpuid}{intrs} / $delta->{$srccpuid}{tot};
	VERIFY($newload <= $srcload && $newload > $delta->{avgintrload},
	    "cpu2cpu: new load didn't end up in expected range");
}


# find_goal() and its helper do_find_goal() are used to find the best
# combination of interrupts in order to generate a load that is as close
# as possible to a goal load without falling below that goal.
# Before returning to its caller, find_goal() sets a new value in the hash
# of each interrupt, {goal}, which if set signifies that this interrupt is
# one of the interrupts identified as part of the set of interrupts which
# best meet the goal.
#
# The arguments to find_goal are a list of ivecs (hash references), sorted
# by descending {time}, and the goal load.  The goal is relative to {time}.
# The best fit is determined by performing a depth-first search.  do_find_goal
# is the recursive subroutine which carries out the search.
#
# It is passed an index as an argument, originally 0.  On a given invocation,
# it is only to consider interrupts in the ivecs array starting at that index.
# It then considers two possibilities:
#   1) What is the best goal-fit if I include ivecs[index]?
#   2) What is the best goal-fit if I exclude ivecs[index]?
# To determine case 1, it subtracts the load of ivecs[index] from the goal,
# and calls itself recursively with that new goal and index + 1.
# To determine case 2, it calls itself recursively with the same goal and
# index + 1.
#
# It then compares the two results, decides which one best meets the goal,
# and returns the result.  The return value is the best-fit's interrupt load,
# followed by a list of all the interrupts which make up that best-fit.
#
# As an optimization, a second array loads[] is created which mirrors ivecs[].
# loads[i] will equal the total loads of all ivecs[i..$#ivecs].  This is used
# by do_find_goal to avoid recursing all the way to the end of the ivecs
# array if including all remaining interrupts will still leave the best-fit
# at or below goal load.  If so, it then includes all remaining interrupts on
# the goal list and returns.
#
# find_goal($ivecs, $goal)
#   $ivecs - reference to a list of ivec hashes, sorted by descending {time}
#   $goal  - the load to aim for, in the same units as {time}
# Sets $ivec->{goal} to 1 on every ivec selected for the goal set and to 0
# on every other ivec in @$ivecs.  Callers read the result from those
# {goal} fields; the return value is not meaningful.
#
sub find_goal($$)    # private function
{
    my ($ivecs, $goal) = @_;

    my @goals;
    my $load;
    my $ivec;

    if ($goal <= 0) {
        @goals = ();    # the empty set will best meet the goal
    } else {
        syslog('debug', "finding goal from intrs %s",
            ivecs_to_string(@$ivecs));

        # Generate @loads array: $loads[i] is the sum of {time} over
        # $ivecs->[i .. end], built by starting from the grand total
        # and peeling off one ivec at a time.

        my $tot = 0;
        foreach $ivec (@$ivecs) {
            $tot += $ivec->{time};
        }
        my @loads = ();
        foreach $ivec (@$ivecs) {
            push(@loads, $tot);
            $tot -= $ivec->{time};
        }
        ($load, @goals) = do_find_goal($ivecs, \@loads, $goal, 0);
        VERIFY($load >= $goal, "find_goal didn't meet goals");
    }
    syslog('debug', "goals found: %s", ivecs_to_string(@goals));

    # Set or clear $ivec->{goal} for each ivec, based on returned @goals.
    # @goals lists the chosen ivecs in the same relative order as @$ivecs,
    # so a single pass comparing against the head of @goals is enough.

    foreach $ivec (@$ivecs) {
        if ($#goals > -1 && $ivec == $goals[0]) {
            syslog('debug', "inum $ivec->{inum} on source cpu");
            $ivec->{goal} = 1;
            shift(@goals);
        } else {
            syslog('debug', "inum $ivec->{inum} on target cpu");
            $ivec->{goal} = 0;
        }
    }
}


#
# do_find_goal($ivecs, $loads, $goal, $idx)
#   $ivecs - reference to the list of candidate ivec hashes
#   $loads - reference to a list in which $loads->[i] is the summed {time}
#            of $ivecs->[i .. end]
#   $goal  - the load still to be met
#   $idx   - index of the first ivec this invocation may consider
# Returns a list whose first element is the load of the best fit found and
# whose remaining elements are the ivecs making up that fit, in @$ivecs
# order.  Returns (0) when $idx is past the end of @$ivecs.
#
sub do_find_goal($$$$)    # private function
{
    my ($ivecs, $loads, $goal, $idx) = @_;

    if ($idx > $#{$ivecs}) {
        return (0);
    }
    syslog('debug', "$idx: finding goal $goal inum $ivecs->[$idx]{inum}");

    my $load = $ivecs->[$idx]{time};
    my @goals_with = ();
    my @goals_without = ();
    my ($with, $without);

    # If we include all remaining items and we're still at or below goal,
    # stop here.  We can just return a result that includes $idx and all
    # subsequent ivecs.  Since this will still not exceed the goal,
    # there's nothing better to be done.

    if ($loads->[$idx] <= $goal) {
        syslog('debug',
            "$idx: including all remaining intrs %s with load %d",
            ivecs_to_string(@$ivecs[$idx .. $#{$ivecs}]),
            $loads->[$idx]);
        return ($loads->[$idx], @$ivecs[$idx .. $#{$ivecs}]);
    }

    # Evaluate the "with" option, i.e. the best matching goal which
    # includes $ivecs->[$idx].  If idx's load is more than our goal load,
    # stop here.  Once we're above the goal, there is no need to consider
    # further interrupts since they'll only take us further from the goal.

    if ($goal <= $load) {
        $with = $load;    # stop here
    } else {
        ($with, @goals_with) =
            do_find_goal($ivecs, $loads, $goal - $load, $idx + 1);
        $with += $load;
    }
    syslog('debug', "$idx: with-load $with intrs %s",
        ivecs_to_string($ivecs->[$idx], @goals_with));

    # Evaluate the "without" option, i.e. the best matching goal which
    # excludes $ivecs->[$idx].

    ($without, @goals_without) =
        do_find_goal($ivecs, $loads, $goal, $idx + 1);
    syslog('debug', "$idx: without-load $without intrs %s",
        ivecs_to_string(@goals_without));

    # We now have our "with" and "without" options, and we choose which
    # best fits the goal.  If one is greater than goal and the other is
    # below goal, we choose the one that is greater.  If they are both
    # below goal, then we choose the one that is greater.  If they are
    # both above goal, then we choose the smaller.

    my $which;    # 0 == with, 1 == without
    if ($with >= $goal && $without < $goal) {
        $which = 0;
    } elsif ($with < $goal && $without >= $goal) {
        $which = 1;
    } elsif ($with >= $goal && $without >= $goal) {
        $which = ($without < $with);
    } else {
        $which = ($without > $with);
    }

    # Return the load of our best case scenario, followed by all the ivecs
    # which compose that goal.

    if ($which == 1) {    # without
        syslog('debug', "$idx: going without");
        return ($without, @goals_without);
    } else {
        syslog('debug', "$idx: going with");
        return ($with, $ivecs->[$idx], @goals_with);
    }
    # Not reached
}




#
# Main program: the daemon's startup code and its endless
# sample / evaluate / reconfigure loop.
#

syslog('debug', "intrd is starting".($debug ? " (debug)" : ""));

my @deltas = ();
my $deltas_tottime = 0;    # sum of maxsnap-minsnap across @deltas
my $avggoodness;
my $baseline_goodness = 0;
my $compdelta;

my $do_reconfig;

# temp variables
my $goodness;
my $deltatime;
my $olddelta;
my $olddeltatime;
my $delta;
my $newstat;
my $below_statslen;
my $newtime;
my $ret;


# Signals only raise a flag; the main loop checks it at safe points.
my $gotsig = 0;
$SIG{INT} = sub { $gotsig = 1; };    # don't die in the middle of retargeting
$SIG{HUP} = $SIG{INT};
$SIG{TERM} = $SIG{INT};

my $ks;
if ($using_scengen == 0) {
    $ks = Sun::Solaris::Kstat->new();
} else {
    $ks = myks_update();    # supplied by the simulator
}

# If no pci_intrs kstats were found, we need to exit, but we can't because
# SMF will restart us and/or report an error to the administrator.  But
# there's nothing an administrator can do.  So print out a message for SMF
# logs and silently pause forever.

if (!exists($ks->{pci_intrs})) {
    print STDERR "$cmdname: no interrupts were found; ".
        "your PCI bus may not yet be supported\n";
    pause() while $gotsig == 0;
    exit 0;
}

my $stat = getstat($ks);


# Discard every accumulated delta so that statistics gathering restarts
# from scratch.  This is a named sub and is therefore defined once at
# compile time; it is declared here, ahead of the main loop that calls it,
# rather than inside the loop body.
sub clear_deltas
{
    @deltas = ();
    $deltas_tottime = 0;
    $stat = 0;    # prevent next gen_delta() from setting {missing}
}


for (;;) {
    # 1. Sleep, update the kstats, and save the new stats in $newstat.

    exit 0 if $gotsig;    # if we got ^C / SIGTERM, exit
    if ($using_scengen == 0) {
        sleep($sleeptime);
        exit 0 if $gotsig;    # if we got ^C / SIGTERM, exit
        $ks->update();
    } else {
        $ks = myks_update();
    }
    $newstat = getstat($ks);

    # $stat or $newstat could be zero if they're uninitialized, or if
    # getstat() failed.  If $stat is zero, move $newstat to $stat, sleep
    # and try again.  If $newstat is zero, then we also sleep and try
    # again, hoping the problem will clear up.

    next if (!ref $newstat);
    if (!ref $stat) {
        $stat = $newstat;
        next;
    }


    # 2. Compare $newstat with the prior set of values, result in %$delta.

    $delta = generate_delta($stat, $newstat);
    dumpdelta($delta) if $debug;    # Dump most recent stats to stdout.
    $stat = $newstat;    # The new stats now become the old stats.


    # 3. If $delta->{missing}, then there has been a reconfiguration of
    # either cpus or interrupts (probably both).  We need to toss out our
    # old set of statistics and start from scratch.
    #
    # Also, if the delta covers a very long range of time, then we've
    # been experiencing a system overload that has resulted in intrd
    # not being allowed to run effectively for a while now.  As above,
    # toss our old statistics and start from scratch.

    $deltatime = $delta->{maxsnap} - $delta->{minsnap};
    if ($delta->{missing} > 0 || $deltatime > $statslen) {
        clear_deltas();
        syslog('debug', "evaluating interrupt assignments");
        next;
    }


    # 4. Incorporate new delta into the list of deltas, and associated
    # statistics.  If the accumulated deltas have just now reached
    # $statslen seconds, then it's time to evaluate a reconfiguration.

    $below_statslen = ($deltas_tottime < $statslen);
    $deltas_tottime += $deltatime;
    $do_reconfig = ($below_statslen && $deltas_tottime >= $statslen);
    push(@deltas, $delta);

    # 5. Remove old deltas if total time is more than $statslen.  We use
    # @deltas as a moving average of the last $statslen seconds.  Shift
    # off the oldest deltas, but only if that doesn't cause us to fall
    # below $statslen seconds.

    while (@deltas > 1) {
        $olddelta = $deltas[0];
        $olddeltatime = $olddelta->{maxsnap} - $olddelta->{minsnap};
        $newtime = $deltas_tottime - $olddeltatime;
        last if ($newtime < $statslen);

        shift(@deltas);
        $deltas_tottime = $newtime;
    }

    # 6. The brains of the operation are here.  First, check if we're
    # imbalanced, and if so set $do_reconfig.  If $do_reconfig is set,
    # either because of imbalance or above in step 4, we evaluate a
    # new configuration.
    #
    # First, take @deltas and generate a single "compressed" delta
    # which summarizes them all.  Pass that to do_reconfig and see
    # what it does with it:
    #
    # $ret == -1 : failure
    # $ret ==  0 : current config is optimal (or close enough)
    # $ret ==  1 : reconfiguration has occurred
    #
    # If $ret is -1 or 1, dump all our deltas and start from scratch.
    # Step 4 above will set do_reconfig soon thereafter.
    #
    # If $ret is 0, then nothing has happened because we're already
    # good enough.  Set baseline_goodness to current goodness.

    $compdelta = compress_deltas(\@deltas);
    if (VERIFY(ref($compdelta) eq "HASH", "couldn't compress deltas")) {
        clear_deltas();
        next;
    }
    $compdelta->{goodness} = goodness($compdelta);
    dumpdelta($compdelta) if $debug;

    $goodness = $compdelta->{goodness};
    syslog('debug', "GOODNESS: %5.2f%%", $goodness * 100);

    if ($deltas_tottime >= $statslen &&
        imbalanced($goodness, $baseline_goodness)) {
        $do_reconfig = 1;
    }

    if ($do_reconfig) {
        $ret = do_reconfig($compdelta);

        if ($ret != 0) {
            clear_deltas();
            syslog('debug', "do_reconfig FAILED!") if $ret == -1;
        } else {
            syslog('debug', "setting new baseline of $goodness");
            $baseline_goodness = $goodness;
        }
    }
    syslog('debug', "---------------------------------------");
}