1 #ifndef KMP_STATS_H 2 #define KMP_STATS_H 3 4 /** @file kmp_stats.h 5 * Functions for collecting statistics. 6 */ 7 8 //===----------------------------------------------------------------------===// 9 // 10 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 11 // See https://llvm.org/LICENSE.txt for license information. 12 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 13 // 14 //===----------------------------------------------------------------------===// 15 16 #include "kmp_config.h" 17 #include "kmp_debug.h" 18 19 #if KMP_STATS_ENABLED 20 /* Statistics accumulator. 21 Accumulates number of samples and computes min, max, mean, standard deviation 22 on the fly. 23 24 Online variance calculation algorithm from 25 http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm 26 */ 27 28 #include "kmp_stats_timing.h" 29 #include <limits> 30 #include <math.h> 31 #include <new> // placement new 32 #include <stdint.h> 33 #include <string> 34 #include <vector> 35 36 /* Enable developer statistics here if you want them. They are more detailed 37 than is useful for application characterisation and are intended for the 38 runtime library developer. */ 39 #define KMP_DEVELOPER_STATS 0 40 41 /* Enable/Disable histogram output */ 42 #define KMP_STATS_HIST 0 43 44 /*! 45 * @ingroup STATS_GATHERING 46 * \brief flags to describe the statistic (timer or counter) 47 * 48 */ 49 enum stats_flags_e { 50 noTotal = 1 << 0, //!< do not show a TOTAL_aggregation for this statistic 51 onlyInMaster = 1 << 1, //!< statistic is valid only for primary thread 52 noUnits = 1 << 2, //!< statistic doesn't need units printed next to it 53 notInMaster = 1 << 3, //!< statistic is valid only for non-primary threads 54 logEvent = 1 << 4 //!< statistic can be logged on the event timeline when 55 //! KMP_STATS_EVENTS is on (valid only for timers) 56 }; 57 58 /*! 59 * @ingroup STATS_GATHERING 60 * \brief the states which a thread can be in 61 * 62 */ 63 enum stats_state_e { 64 IDLE, 65 SERIAL_REGION, 66 FORK_JOIN_BARRIER, 67 PLAIN_BARRIER, 68 TASKWAIT, 69 TASKYIELD, 70 TASKGROUP, 71 IMPLICIT_TASK, 72 EXPLICIT_TASK, 73 TEAMS_REGION 74 }; 75 76 /*! 77 * \brief Add new counters under KMP_FOREACH_COUNTER() macro in kmp_stats.h 78 * 79 * @param macro a user defined macro that takes three arguments - 80 * macro(COUNTER_NAME, flags, arg) 81 * @param arg a user defined argument to send to the user defined macro 82 * 83 * \details A counter counts the occurrence of some event. Each thread 84 * accumulates its own count, at the end of execution the counts are aggregated 85 * treating each thread as a separate measurement. (Unless onlyInMaster is set, 86 * in which case there's only a single measurement). The min,mean,max are 87 * therefore the values for the threads. Adding the counter here and then 88 * putting a KMP_BLOCK_COUNTER(name) at the point you want to count is all you 89 * need to do. All of the tables and printing is generated from this macro. 90 * Format is "macro(name, flags, arg)" 91 * 92 * @ingroup STATS_GATHERING 93 */ 94 // clang-format off 95 #define KMP_FOREACH_COUNTER(macro, arg) \ 96 macro(OMP_PARALLEL,stats_flags_e::onlyInMaster|stats_flags_e::noTotal,arg) \ 97 macro(OMP_NESTED_PARALLEL, 0, arg) \ 98 macro(OMP_LOOP_STATIC, 0, arg) \ 99 macro(OMP_LOOP_STATIC_STEAL, 0, arg) \ 100 macro(OMP_LOOP_DYNAMIC, 0, arg) \ 101 macro(OMP_DISTRIBUTE, 0, arg) \ 102 macro(OMP_BARRIER, 0, arg) \ 103 macro(OMP_CRITICAL, 0, arg) \ 104 macro(OMP_SINGLE, 0, arg) \ 105 macro(OMP_SECTIONS, 0, arg) \ 106 macro(OMP_MASTER, 0, arg) \ 107 macro(OMP_MASKED, 0, arg) \ 108 macro(OMP_TEAMS, 0, arg) \ 109 macro(OMP_set_lock, 0, arg) \ 110 macro(OMP_test_lock, 0, arg) \ 111 macro(REDUCE_wait, 0, arg) \ 112 macro(REDUCE_nowait, 0, arg) \ 113 macro(OMP_TASKYIELD, 0, arg) \ 114 macro(OMP_TASKLOOP, 0, arg) \ 115 macro(TASK_executed, 0, arg) \ 116 macro(TASK_cancelled, 0, arg) \ 117 macro(TASK_stolen, 0, arg) 118 // clang-format on 119 120 /*! 121 * \brief Add new timers under KMP_FOREACH_TIMER() macro in kmp_stats.h 122 * 123 * @param macro a user defined macro that takes three arguments - 124 * macro(TIMER_NAME, flags, arg) 125 * @param arg a user defined argument to send to the user defined macro 126 * 127 * \details A timer collects multiple samples of some count in each thread and 128 * then finally aggregates all of the samples from all of the threads. For most 129 * timers the printing code also provides an aggregation over the thread totals. 130 * These are printed as TOTAL_foo. The count is normally a time (in ticks), 131 * hence the name "timer". (But can be any value, so we use this for "number of 132 * arguments passed to fork" as well). For timers the threads are not 133 * significant, it's the individual observations that count, so the statistics 134 * are at that level. Format is "macro(name, flags, arg)" 135 * 136 * @ingroup STATS_GATHERING2 137 */ 138 // clang-format off 139 #define KMP_FOREACH_TIMER(macro, arg) \ 140 macro (OMP_worker_thread_life, stats_flags_e::logEvent, arg) \ 141 macro (OMP_parallel, stats_flags_e::logEvent, arg) \ 142 macro (OMP_parallel_overhead, stats_flags_e::logEvent, arg) \ 143 macro (OMP_teams, stats_flags_e::logEvent, arg) \ 144 macro (OMP_teams_overhead, stats_flags_e::logEvent, arg) \ 145 macro (OMP_loop_static, 0, arg) \ 146 macro (OMP_loop_static_scheduling, 0, arg) \ 147 macro (OMP_loop_dynamic, 0, arg) \ 148 macro (OMP_loop_dynamic_scheduling, 0, arg) \ 149 macro (OMP_distribute, 0, arg) \ 150 macro (OMP_distribute_scheduling, 0, arg) \ 151 macro (OMP_critical, 0, arg) \ 152 macro (OMP_critical_wait, 0, arg) \ 153 macro (OMP_single, 0, arg) \ 154 macro (OMP_sections, 0, arg) \ 155 macro (OMP_sections_overhead, 0, arg) \ 156 macro (OMP_master, 0, arg) \ 157 macro (OMP_masked, 0, arg) \ 158 macro (OMP_task_immediate, 0, arg) \ 159 macro (OMP_task_taskwait, 0, arg) \ 160 macro (OMP_task_taskyield, 0, arg) \ 161 macro (OMP_task_taskgroup, 0, arg) \ 162 macro (OMP_task_join_bar, 0, arg) \ 163 macro (OMP_task_plain_bar, 0, arg) \ 164 macro (OMP_taskloop_scheduling, 0, arg) \ 165 macro (OMP_plain_barrier, stats_flags_e::logEvent, arg) \ 166 macro (OMP_idle, stats_flags_e::logEvent, arg) \ 167 macro (OMP_fork_barrier, stats_flags_e::logEvent, arg) \ 168 macro (OMP_join_barrier, stats_flags_e::logEvent, arg) \ 169 macro (OMP_serial, stats_flags_e::logEvent, arg) \ 170 macro (OMP_set_numthreads, stats_flags_e::noUnits | stats_flags_e::noTotal, \ 171 arg) \ 172 macro (OMP_PARALLEL_args, stats_flags_e::noUnits | stats_flags_e::noTotal, \ 173 arg) \ 174 macro (OMP_loop_static_iterations, \ 175 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ 176 macro (OMP_loop_static_total_iterations, \ 177 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ 178 macro (OMP_loop_dynamic_iterations, \ 179 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ 180 macro (OMP_loop_dynamic_total_iterations, \ 181 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ 182 macro (OMP_distribute_iterations, \ 183 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ 184 KMP_FOREACH_DEVELOPER_TIMER(macro, arg) 185 // clang-format on 186 187 // OMP_worker_thread_life -- Time from thread becoming an OpenMP thread (either 188 // initializing OpenMP or being created by a primary 189 // thread) until the thread is destroyed 190 // OMP_parallel -- Time thread spends executing work directly 191 // within a #pragma omp parallel 192 // OMP_parallel_overhead -- Time thread spends setting up a parallel region 193 // OMP_loop_static -- Time thread spends executing loop iterations from 194 // a statically scheduled loop 195 // OMP_loop_static_scheduling -- Time thread spends scheduling loop iterations 196 // from a statically scheduled loop 197 // OMP_loop_dynamic -- Time thread spends executing loop iterations from 198 // a dynamically scheduled loop 199 // OMP_loop_dynamic_scheduling -- Time thread spends scheduling loop iterations 200 // from a dynamically scheduled loop 201 // OMP_critical -- Time thread spends executing critical section 202 // OMP_critical_wait -- Time thread spends waiting to enter 203 // a critical section 204 // OMP_single -- Time spent executing a "single" region 205 // OMP_master -- Time spent executing a "master" region 206 // OMP_masked -- Time spent executing a "masked" region 207 // OMP_task_immediate -- Time spent executing non-deferred tasks 208 // OMP_task_taskwait -- Time spent executing tasks inside a taskwait 209 // construct 210 // OMP_task_taskyield -- Time spent executing tasks inside a taskyield 211 // construct 212 // OMP_task_taskgroup -- Time spent executing tasks inside a taskygroup 213 // construct 214 // OMP_task_join_bar -- Time spent executing tasks inside a join barrier 215 // OMP_task_plain_bar -- Time spent executing tasks inside a barrier 216 // construct 217 // OMP_taskloop_scheduling -- Time spent scheduling tasks inside a taskloop 218 // construct 219 // OMP_plain_barrier -- Time spent in a #pragma omp barrier construct or 220 // inside implicit barrier at end of worksharing 221 // construct 222 // OMP_idle -- Time worker threads spend waiting for next 223 // parallel region 224 // OMP_fork_barrier -- Time spent in a the fork barrier surrounding a 225 // parallel region 226 // OMP_join_barrier -- Time spent in a the join barrier surrounding a 227 // parallel region 228 // OMP_serial -- Time thread zero spends executing serial code 229 // OMP_set_numthreads -- Values passed to omp_set_num_threads 230 // OMP_PARALLEL_args -- Number of arguments passed to a parallel region 231 // OMP_loop_static_iterations -- Number of iterations thread is assigned for 232 // statically scheduled loops 233 // OMP_loop_dynamic_iterations -- Number of iterations thread is assigned for 234 // dynamically scheduled loops 235 236 #if (KMP_DEVELOPER_STATS) 237 // Timers which are of interest to runtime library developers, not end users. 238 // These have to be explicitly enabled in addition to the other stats. 239 240 // KMP_fork_barrier -- time in __kmp_fork_barrier 241 // KMP_join_barrier -- time in __kmp_join_barrier 242 // KMP_barrier -- time in __kmp_barrier 243 // KMP_end_split_barrier -- time in __kmp_end_split_barrier 244 // KMP_setup_icv_copy -- time in __kmp_setup_icv_copy 245 // KMP_icv_copy -- start/stop timer for any ICV copying 246 // KMP_linear_gather -- time in __kmp_linear_barrier_gather 247 // KMP_linear_release -- time in __kmp_linear_barrier_release 248 // KMP_tree_gather -- time in __kmp_tree_barrier_gather 249 // KMP_tree_release -- time in __kmp_tree_barrier_release 250 // KMP_hyper_gather -- time in __kmp_hyper_barrier_gather 251 // KMP_hyper_release -- time in __kmp_hyper_barrier_release 252 // KMP_dist_gather -- time in __kmp_dist_barrier_gather 253 // KMP_dist_release -- time in __kmp_dist_barrier_release 254 // clang-format off 255 #define KMP_FOREACH_DEVELOPER_TIMER(macro, arg) \ 256 macro(KMP_fork_call, 0, arg) \ 257 macro(KMP_join_call, 0, arg) \ 258 macro(KMP_end_split_barrier, 0, arg) \ 259 macro(KMP_hier_gather, 0, arg) \ 260 macro(KMP_hier_release, 0, arg) \ 261 macro(KMP_hyper_gather, 0, arg) \ 262 macro(KMP_hyper_release, 0, arg) \ 263 macro(KMP_dist_gather, 0, arg) \ 264 macro(KMP_dist_release, 0, arg) \ 265 macro(KMP_linear_gather, 0, arg) \ 266 macro(KMP_linear_release, 0, arg) \ 267 macro(KMP_tree_gather, 0, arg) \ 268 macro(KMP_tree_release, 0, arg) \ 269 macro(USER_resume, 0, arg) \ 270 macro(USER_suspend, 0, arg) \ 271 macro(USER_mwait, 0, arg) \ 272 macro(KMP_allocate_team, 0, arg) \ 273 macro(KMP_setup_icv_copy, 0, arg) \ 274 macro(USER_icv_copy, 0, arg) \ 275 macro (FOR_static_steal_stolen, \ 276 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ 277 macro (FOR_static_steal_chunks, \ 278 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) 279 #else 280 #define KMP_FOREACH_DEVELOPER_TIMER(macro, arg) 281 #endif 282 // clang-format on 283 284 /*! 285 * \brief Add new explicit timers under KMP_FOREACH_EXPLICIT_TIMER() macro. 286 * 287 * @param macro a user defined macro that takes three arguments - 288 * macro(TIMER_NAME, flags, arg) 289 * @param arg a user defined argument to send to the user defined macro 290 * 291 * \warning YOU MUST HAVE THE SAME NAMED TIMER UNDER KMP_FOREACH_TIMER() OR ELSE 292 * BAD THINGS WILL HAPPEN! 293 * 294 * \details Explicit timers are ones where we need to allocate a timer itself 295 * (as well as the accumulated timing statistics). We allocate these on a 296 * per-thread basis, and explicitly start and stop them. Block timers just 297 * allocate the timer itself on the stack, and use the destructor to notice 298 * block exit; they don't need to be defined here. The name here should be the 299 * same as that of a timer above. 300 * 301 * @ingroup STATS_GATHERING 302 */ 303 #define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) KMP_FOREACH_TIMER(macro, arg) 304 305 #define ENUMERATE(name, ignore, prefix) prefix##name, 306 enum timer_e { KMP_FOREACH_TIMER(ENUMERATE, TIMER_) TIMER_LAST }; 307 308 enum explicit_timer_e { 309 KMP_FOREACH_EXPLICIT_TIMER(ENUMERATE, EXPLICIT_TIMER_) EXPLICIT_TIMER_LAST 310 }; 311 312 enum counter_e { KMP_FOREACH_COUNTER(ENUMERATE, COUNTER_) COUNTER_LAST }; 313 #undef ENUMERATE 314 315 /* 316 * A logarithmic histogram. It accumulates the number of values in each power of 317 * ten bin. So 1<=x<10, 10<=x<100, ... 318 * Mostly useful where we have some big outliers and want to see information 319 * about them. 320 */ 321 class logHistogram { 322 enum { 323 numBins = 31, /* Number of powers of 10. If this changes you need to change 324 * the initializer for binMax */ 325 326 /* 327 * If you want to use this to analyse values that may be less than 1, (for 328 * instance times in s), then the logOffset gives you negative powers. 329 * In our case here, we're just looking at times in ticks, or counts, so we 330 * can never see values with magnitude < 1 (other than zero), so we can set 331 * it to 0. As above change the initializer if you change this. 332 */ 333 logOffset = 0 334 }; 335 uint32_t KMP_ALIGN_CACHE zeroCount; 336 struct { 337 uint32_t count; 338 double total; 339 } bins[numBins]; 340 341 static double binMax[numBins]; 342 343 #ifdef KMP_DEBUG 344 uint64_t _total; 345 346 void check() const { 347 uint64_t t = zeroCount; 348 for (int i = 0; i < numBins; i++) 349 t += bins[i].count; 350 KMP_DEBUG_ASSERT(t == _total); 351 } 352 #else 353 void check() const {} 354 #endif 355 356 public: 357 logHistogram() { reset(); } 358 359 logHistogram(logHistogram const &o) { 360 for (int i = 0; i < numBins; i++) 361 bins[i] = o.bins[i]; 362 #ifdef KMP_DEBUG 363 _total = o._total; 364 #endif 365 } 366 367 void reset() { 368 zeroCount = 0; 369 for (int i = 0; i < numBins; i++) { 370 bins[i].count = 0; 371 bins[i].total = 0; 372 } 373 374 #ifdef KMP_DEBUG 375 _total = 0; 376 #endif 377 } 378 uint32_t count(int b) const { return bins[b + logOffset].count; } 379 double total(int b) const { return bins[b + logOffset].total; } 380 static uint32_t findBin(double sample); 381 382 logHistogram &operator+=(logHistogram const &o) { 383 zeroCount += o.zeroCount; 384 for (int i = 0; i < numBins; i++) { 385 bins[i].count += o.bins[i].count; 386 bins[i].total += o.bins[i].total; 387 } 388 #ifdef KMP_DEBUG 389 _total += o._total; 390 check(); 391 #endif 392 393 return *this; 394 } 395 396 void addSample(double sample); 397 int minBin() const; 398 int maxBin() const; 399 400 std::string format(char) const; 401 }; 402 403 class statistic { 404 double KMP_ALIGN_CACHE minVal; 405 double maxVal; 406 double meanVal; 407 double m2; 408 uint64_t sampleCount; 409 double offset; 410 bool collectingHist; 411 logHistogram hist; 412 413 public: 414 statistic(bool doHist = bool(KMP_STATS_HIST)) { 415 reset(); 416 collectingHist = doHist; 417 } 418 statistic(statistic const &o) 419 : minVal(o.minVal), maxVal(o.maxVal), meanVal(o.meanVal), m2(o.m2), 420 sampleCount(o.sampleCount), offset(o.offset), 421 collectingHist(o.collectingHist), hist(o.hist) {} 422 statistic(double minv, double maxv, double meanv, uint64_t sc, double sd) 423 : minVal(minv), maxVal(maxv), meanVal(meanv), m2(sd * sd * sc), 424 sampleCount(sc), offset(0.0), collectingHist(false) {} 425 bool haveHist() const { return collectingHist; } 426 double getMin() const { return minVal; } 427 double getMean() const { return meanVal; } 428 double getMax() const { return maxVal; } 429 uint64_t getCount() const { return sampleCount; } 430 double getSD() const { return sqrt(m2 / sampleCount); } 431 double getTotal() const { return sampleCount * meanVal; } 432 logHistogram const *getHist() const { return &hist; } 433 void setOffset(double d) { offset = d; } 434 435 void reset() { 436 minVal = (std::numeric_limits<double>::max)(); 437 maxVal = -minVal; 438 meanVal = 0.0; 439 m2 = 0.0; 440 sampleCount = 0; 441 offset = 0.0; 442 hist.reset(); 443 } 444 void addSample(double sample); 445 void scale(double factor); 446 void scaleDown(double f) { scale(1. / f); } 447 void forceCount(uint64_t count) { sampleCount = count; } 448 statistic &operator+=(statistic const &other); 449 450 std::string format(char unit, bool total = false) const; 451 std::string formatHist(char unit) const { return hist.format(unit); } 452 }; 453 454 struct statInfo { 455 const char *name; 456 uint32_t flags; 457 }; 458 459 class timeStat : public statistic { 460 static statInfo timerInfo[]; 461 462 public: 463 timeStat() : statistic() {} 464 static const char *name(timer_e e) { return timerInfo[e].name; } 465 static bool noTotal(timer_e e) { 466 return timerInfo[e].flags & stats_flags_e::noTotal; 467 } 468 static bool masterOnly(timer_e e) { 469 return timerInfo[e].flags & stats_flags_e::onlyInMaster; 470 } 471 static bool workerOnly(timer_e e) { 472 return timerInfo[e].flags & stats_flags_e::notInMaster; 473 } 474 static bool noUnits(timer_e e) { 475 return timerInfo[e].flags & stats_flags_e::noUnits; 476 } 477 static bool logEvent(timer_e e) { 478 return timerInfo[e].flags & stats_flags_e::logEvent; 479 } 480 static void clearEventFlags() { 481 for (int i = 0; i < TIMER_LAST; i++) { 482 timerInfo[i].flags &= (~(stats_flags_e::logEvent)); 483 } 484 } 485 }; 486 487 // Where we need explicitly to start and end the timer, this version can be used 488 // Since these timers normally aren't nicely scoped, so don't have a good place 489 // to live on the stack of the thread, they're more work to use. 490 class explicitTimer { 491 timeStat *stat; 492 timer_e timerEnumValue; 493 tsc_tick_count startTime; 494 tsc_tick_count pauseStartTime; 495 tsc_tick_count::tsc_interval_t totalPauseTime; 496 497 public: 498 explicitTimer(timeStat *s, timer_e te) 499 : stat(s), timerEnumValue(te), startTime(), pauseStartTime(0), 500 totalPauseTime() {} 501 502 // void setStat(timeStat *s) { stat = s; } 503 void start(tsc_tick_count tick); 504 void pause(tsc_tick_count tick) { pauseStartTime = tick; } 505 void resume(tsc_tick_count tick) { 506 totalPauseTime += (tick - pauseStartTime); 507 } 508 void stop(tsc_tick_count tick, kmp_stats_list *stats_ptr = nullptr); 509 void reset() { 510 startTime = 0; 511 pauseStartTime = 0; 512 totalPauseTime = 0; 513 } 514 timer_e get_type() const { return timerEnumValue; } 515 }; 516 517 // Where you need to partition a threads clock ticks into separate states 518 // e.g., a partitionedTimers class with two timers of EXECUTING_TASK, and 519 // DOING_NOTHING would render these conditions: 520 // time(EXECUTING_TASK) + time(DOING_NOTHING) = total time thread is alive 521 // No clock tick in the EXECUTING_TASK is a member of DOING_NOTHING and vice 522 // versa 523 class partitionedTimers { 524 private: 525 std::vector<explicitTimer> timer_stack; 526 527 public: 528 partitionedTimers(); 529 void init(explicitTimer timer); 530 void exchange(explicitTimer timer); 531 void push(explicitTimer timer); 532 void pop(); 533 void windup(); 534 }; 535 536 // Special wrapper around the partitioned timers to aid timing code blocks 537 // It avoids the need to have an explicit end, leaving the scope suffices. 538 class blockPartitionedTimer { 539 partitionedTimers *part_timers; 540 541 public: 542 blockPartitionedTimer(partitionedTimers *pt, explicitTimer timer) 543 : part_timers(pt) { 544 part_timers->push(timer); 545 } 546 ~blockPartitionedTimer() { part_timers->pop(); } 547 }; 548 549 // Special wrapper around the thread state to aid in keeping state in code 550 // blocks It avoids the need to have an explicit end, leaving the scope 551 // suffices. 552 class blockThreadState { 553 stats_state_e *state_pointer; 554 stats_state_e old_state; 555 556 public: 557 blockThreadState(stats_state_e *thread_state_pointer, stats_state_e new_state) 558 : state_pointer(thread_state_pointer), old_state(*thread_state_pointer) { 559 *state_pointer = new_state; 560 } 561 ~blockThreadState() { *state_pointer = old_state; } 562 }; 563 564 // If all you want is a count, then you can use this... 565 // The individual per-thread counts will be aggregated into a statistic at 566 // program exit. 567 class counter { 568 uint64_t value; 569 static const statInfo counterInfo[]; 570 571 public: 572 counter() : value(0) {} 573 void increment() { value++; } 574 uint64_t getValue() const { return value; } 575 void reset() { value = 0; } 576 static const char *name(counter_e e) { return counterInfo[e].name; } 577 static bool masterOnly(counter_e e) { 578 return counterInfo[e].flags & stats_flags_e::onlyInMaster; 579 } 580 }; 581 582 /* **************************************************************** 583 Class to implement an event 584 585 There are four components to an event: start time, stop time 586 nest_level, and timer_name. 587 The start and stop time should be obvious (recorded in clock ticks). 588 The nest_level relates to the bar width in the timeline graph. 589 The timer_name is used to determine which timer event triggered this event. 590 591 the interface to this class is through four read-only operations: 592 1) getStart() -- returns the start time as 64 bit integer 593 2) getStop() -- returns the stop time as 64 bit integer 594 3) getNestLevel() -- returns the nest level of the event 595 4) getTimerName() -- returns the timer name that triggered event 596 597 *MORE ON NEST_LEVEL* 598 The nest level is used in the bar graph that represents the timeline. 599 Its main purpose is for showing how events are nested inside eachother. 600 For example, say events, A, B, and C are recorded. If the timeline 601 looks like this: 602 603 Begin -------------------------------------------------------------> Time 604 | | | | | | 605 A B C C B A 606 start start start end end end 607 608 Then A, B, C will have a nest level of 1, 2, 3 respectively. 609 These values are then used to calculate the barwidth so you can 610 see that inside A, B has occurred, and inside B, C has occurred. 611 Currently, this is shown with A's bar width being larger than B's 612 bar width, and B's bar width being larger than C's bar width. 613 614 **************************************************************** */ 615 class kmp_stats_event { 616 uint64_t start; 617 uint64_t stop; 618 int nest_level; 619 timer_e timer_name; 620 621 public: 622 kmp_stats_event() 623 : start(0), stop(0), nest_level(0), timer_name(TIMER_LAST) {} 624 kmp_stats_event(uint64_t strt, uint64_t stp, int nst, timer_e nme) 625 : start(strt), stop(stp), nest_level(nst), timer_name(nme) {} 626 inline uint64_t getStart() const { return start; } 627 inline uint64_t getStop() const { return stop; } 628 inline int getNestLevel() const { return nest_level; } 629 inline timer_e getTimerName() const { return timer_name; } 630 }; 631 632 /* **************************************************************** 633 Class to implement a dynamically expandable array of events 634 635 --------------------------------------------------------- 636 | event 1 | event 2 | event 3 | event 4 | ... | event N | 637 --------------------------------------------------------- 638 639 An event is pushed onto the back of this array at every 640 explicitTimer->stop() call. The event records the thread #, 641 start time, stop time, and nest level related to the bar width. 642 643 The event vector starts at size INIT_SIZE and grows (doubles in size) 644 if needed. An implication of this behavior is that log(N) 645 reallocations are needed (where N is number of events). If you want 646 to avoid reallocations, then set INIT_SIZE to a large value. 647 648 the interface to this class is through six operations: 649 1) reset() -- sets the internal_size back to 0 but does not deallocate any 650 memory 651 2) size() -- returns the number of valid elements in the vector 652 3) push_back(start, stop, nest, timer_name) -- pushes an event onto 653 the back of the array 654 4) deallocate() -- frees all memory associated with the vector 655 5) sort() -- sorts the vector by start time 656 6) operator[index] or at(index) -- returns event reference at that index 657 **************************************************************** */ 658 class kmp_stats_event_vector { 659 kmp_stats_event *events; 660 int internal_size; 661 int allocated_size; 662 static const int INIT_SIZE = 1024; 663 664 public: 665 kmp_stats_event_vector() { 666 events = 667 (kmp_stats_event *)__kmp_allocate(sizeof(kmp_stats_event) * INIT_SIZE); 668 internal_size = 0; 669 allocated_size = INIT_SIZE; 670 } 671 ~kmp_stats_event_vector() {} 672 inline void reset() { internal_size = 0; } 673 inline int size() const { return internal_size; } 674 void push_back(uint64_t start_time, uint64_t stop_time, int nest_level, 675 timer_e name) { 676 int i; 677 if (internal_size == allocated_size) { 678 kmp_stats_event *tmp = (kmp_stats_event *)__kmp_allocate( 679 sizeof(kmp_stats_event) * allocated_size * 2); 680 for (i = 0; i < internal_size; i++) 681 tmp[i] = events[i]; 682 __kmp_free(events); 683 events = tmp; 684 allocated_size *= 2; 685 } 686 events[internal_size] = 687 kmp_stats_event(start_time, stop_time, nest_level, name); 688 internal_size++; 689 return; 690 } 691 void deallocate(); 692 void sort(); 693 const kmp_stats_event &operator[](int index) const { return events[index]; } 694 kmp_stats_event &operator[](int index) { return events[index]; } 695 const kmp_stats_event &at(int index) const { return events[index]; } 696 kmp_stats_event &at(int index) { return events[index]; } 697 }; 698 699 /* **************************************************************** 700 Class to implement a doubly-linked, circular, statistics list 701 702 |---| ---> |---| ---> |---| ---> |---| ---> ... next 703 | | | | | | | | 704 |---| <--- |---| <--- |---| <--- |---| <--- ... prev 705 Sentinel first second third 706 Node node node node 707 708 The Sentinel Node is the user handle on the list. 709 The first node corresponds to thread 0's statistics. 710 The second node corresponds to thread 1's statistics and so on... 711 712 Each node has a _timers, _counters, and _explicitTimers array to hold that 713 thread's statistics. The _explicitTimers point to the correct _timer and 714 update its statistics at every stop() call. The explicitTimers' pointers are 715 set up in the constructor. Each node also has an event vector to hold that 716 thread's timing events. The event vector expands as necessary and records 717 the start-stop times for each timer. 718 719 The nestLevel variable is for plotting events and is related 720 to the bar width in the timeline graph. 721 722 Every thread will have a thread local pointer to its node in 723 the list. The sentinel node is used by the primary thread to 724 store "dummy" statistics before __kmp_create_worker() is called. 725 **************************************************************** */ 726 class kmp_stats_list { 727 int gtid; 728 timeStat _timers[TIMER_LAST + 1]; 729 counter _counters[COUNTER_LAST + 1]; 730 explicitTimer thread_life_timer; 731 partitionedTimers _partitionedTimers; 732 int _nestLevel; // one per thread 733 kmp_stats_event_vector _event_vector; 734 kmp_stats_list *next; 735 kmp_stats_list *prev; 736 stats_state_e state; 737 int thread_is_idle_flag; 738 739 public: 740 kmp_stats_list() 741 : thread_life_timer(&_timers[TIMER_OMP_worker_thread_life], 742 TIMER_OMP_worker_thread_life), 743 _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE), 744 thread_is_idle_flag(0) {} 745 ~kmp_stats_list() {} 746 inline timeStat *getTimer(timer_e idx) { return &_timers[idx]; } 747 inline counter *getCounter(counter_e idx) { return &_counters[idx]; } 748 inline partitionedTimers *getPartitionedTimers() { 749 return &_partitionedTimers; 750 } 751 inline timeStat *getTimers() { return _timers; } 752 inline counter *getCounters() { return _counters; } 753 inline kmp_stats_event_vector &getEventVector() { return _event_vector; } 754 inline void startLife() { thread_life_timer.start(tsc_tick_count::now()); } 755 inline void endLife() { thread_life_timer.stop(tsc_tick_count::now(), this); } 756 inline void resetEventVector() { _event_vector.reset(); } 757 inline void incrementNestValue() { _nestLevel++; } 758 inline int getNestValue() { return _nestLevel; } 759 inline void decrementNestValue() { _nestLevel--; } 760 inline int getGtid() const { return gtid; } 761 inline void setGtid(int newgtid) { gtid = newgtid; } 762 inline void setState(stats_state_e newstate) { state = newstate; } 763 inline stats_state_e getState() const { return state; } 764 inline stats_state_e *getStatePointer() { return &state; } 765 inline bool isIdle() { return thread_is_idle_flag == 1; } 766 inline void setIdleFlag() { thread_is_idle_flag = 1; } 767 inline void resetIdleFlag() { thread_is_idle_flag = 0; } 768 kmp_stats_list *push_back(int gtid); // returns newly created list node 769 inline void push_event(uint64_t start_time, uint64_t stop_time, 770 int nest_level, timer_e name) { 771 _event_vector.push_back(start_time, stop_time, nest_level, name); 772 } 773 void deallocate(); 774 class iterator; 775 kmp_stats_list::iterator begin(); 776 kmp_stats_list::iterator end(); 777 int size(); 778 class iterator { 779 kmp_stats_list *ptr; 780 friend kmp_stats_list::iterator kmp_stats_list::begin(); 781 friend kmp_stats_list::iterator kmp_stats_list::end(); 782 783 public: 784 iterator(); 785 ~iterator(); 786 iterator operator++(); 787 iterator operator++(int dummy); 788 iterator operator--(); 789 iterator operator--(int dummy); 790 bool operator!=(const iterator &rhs); 791 bool operator==(const iterator &rhs); 792 kmp_stats_list *operator*() const; // dereference operator 793 }; 794 }; 795 796 /* **************************************************************** 797 Class to encapsulate all output functions and the environment variables 798 799 This module holds filenames for various outputs (normal stats, events, plot 800 file), as well as coloring information for the plot file. 801 802 The filenames and flags variables are read from environment variables. 803 These are read once by the constructor of the global variable 804 __kmp_stats_output which calls init(). 805 806 During this init() call, event flags for the timeStat::timerInfo[] global 807 array are cleared if KMP_STATS_EVENTS is not true (on, 1, yes). 808 809 The only interface function that is public is outputStats(heading). This 810 function should print out everything it needs to, either to files or stderr, 811 depending on the environment variables described below 812 813 ENVIRONMENT VARIABLES: 814 KMP_STATS_FILE -- if set, all statistics (not events) will be printed to this 815 file, otherwise, print to stderr 816 KMP_STATS_THREADS -- if set to "on", then will print per thread statistics to 817 either KMP_STATS_FILE or stderr 818 KMP_STATS_PLOT_FILE -- if set, print the ploticus plot file to this filename, 819 otherwise, the plot file is sent to "events.plt" 820 KMP_STATS_EVENTS -- if set to "on", then log events, otherwise, don't log 821 events 822 KMP_STATS_EVENTS_FILE -- if set, all events are outputted to this file, 823 otherwise, output is sent to "events.dat" 824 **************************************************************** */ 825 class kmp_stats_output_module { 826 827 public: 828 struct rgb_color { 829 float r; 830 float g; 831 float b; 832 }; 833 834 private: 835 std::string outputFileName; 836 static const char *eventsFileName; 837 static const char *plotFileName; 838 static int printPerThreadFlag; 839 static int printPerThreadEventsFlag; 840 static const rgb_color globalColorArray[]; 841 static rgb_color timerColorInfo[]; 842 843 void init(); 844 static void setupEventColors(); 845 static void printPloticusFile(); 846 static void printHeaderInfo(FILE *statsOut); 847 static void printTimerStats(FILE *statsOut, statistic const *theStats, 848 statistic const *totalStats); 849 static void printCounterStats(FILE *statsOut, statistic const *theStats); 850 static void printCounters(FILE *statsOut, counter const *theCounters); 851 static void printEvents(FILE *eventsOut, kmp_stats_event_vector *theEvents, 852 int gtid); 853 static rgb_color getEventColor(timer_e e) { return timerColorInfo[e]; } 854 static void windupExplicitTimers(); 855 bool eventPrintingEnabled() const { return printPerThreadEventsFlag; } 856 857 public: 858 kmp_stats_output_module() { init(); } 859 void outputStats(const char *heading); 860 }; 861 862 #ifdef __cplusplus 863 extern "C" { 864 #endif 865 void __kmp_stats_init(); 866 void __kmp_stats_fini(); 867 void __kmp_reset_stats(); 868 void __kmp_output_stats(const char *); 869 void __kmp_accumulate_stats_at_exit(void); 870 // thread local pointer to stats node within list 871 extern KMP_THREAD_LOCAL kmp_stats_list *__kmp_stats_thread_ptr; 872 // head to stats list. 873 extern kmp_stats_list *__kmp_stats_list; 874 // lock for __kmp_stats_list 875 extern kmp_tas_lock_t __kmp_stats_lock; 876 // reference start time 877 extern tsc_tick_count __kmp_stats_start_time; 878 // interface to output 879 extern kmp_stats_output_module __kmp_stats_output; 880 881 #ifdef __cplusplus 882 } 883 #endif 884 885 // Simple, standard interfaces that drop out completely if stats aren't enabled 886 887 /*! 888 * \brief Adds value to specified timer (name). 889 * 890 * @param name timer name as specified under the KMP_FOREACH_TIMER() macro 891 * @param value double precision sample value to add to statistics for the timer 892 * 893 * \details Use KMP_COUNT_VALUE(name, value) macro to add a particular value to 894 * a timer statistics. 895 * 896 * @ingroup STATS_GATHERING 897 */ 898 #define KMP_COUNT_VALUE(name, value) \ 899 __kmp_stats_thread_ptr->getTimer(TIMER_##name)->addSample((double)value) 900 901 /*! 902 * \brief Increments specified counter (name). 903 * 904 * @param name counter name as specified under the KMP_FOREACH_COUNTER() macro 905 * 906 * \details Use KMP_COUNT_BLOCK(name, value) macro to increment a statistics 907 * counter for the executing thread. 908 * 909 * @ingroup STATS_GATHERING 910 */ 911 #define KMP_COUNT_BLOCK(name) \ 912 __kmp_stats_thread_ptr->getCounter(COUNTER_##name)->increment() 913 914 /*! 915 * \brief Outputs the current thread statistics and reset them. 916 * 917 * @param heading_string heading put above the final stats output 918 * 919 * \details Explicitly stops all timers and outputs all stats. Environment 920 * variable, `OMPTB_STATSFILE=filename`, can be used to output the stats to a 921 * filename instead of stderr. Environment variable, 922 * `OMPTB_STATSTHREADS=true|undefined`, can be used to output thread specific 923 * stats. For now the `OMPTB_STATSTHREADS` environment variable can either be 924 * defined with any value, which will print out thread specific stats, or it can 925 * be undefined (not specified in the environment) and thread specific stats 926 * won't be printed. It should be noted that all statistics are reset when this 927 * macro is called. 928 * 929 * @ingroup STATS_GATHERING 930 */ 931 #define KMP_OUTPUT_STATS(heading_string) __kmp_output_stats(heading_string) 932 933 /*! 934 * \brief Initializes the partitioned timers to begin with name. 935 * 936 * @param name timer which you want this thread to begin with 937 * 938 * @ingroup STATS_GATHERING 939 */ 940 #define KMP_INIT_PARTITIONED_TIMERS(name) \ 941 __kmp_stats_thread_ptr->getPartitionedTimers()->init(explicitTimer( \ 942 __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name)) 943 944 #define KMP_TIME_PARTITIONED_BLOCK(name) \ 945 blockPartitionedTimer __PBLOCKTIME__( \ 946 __kmp_stats_thread_ptr->getPartitionedTimers(), \ 947 explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name), \ 948 TIMER_##name)) 949 950 #define KMP_PUSH_PARTITIONED_TIMER(name) \ 951 __kmp_stats_thread_ptr->getPartitionedTimers()->push(explicitTimer( \ 952 __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name)) 953 954 #define KMP_POP_PARTITIONED_TIMER() \ 955 __kmp_stats_thread_ptr->getPartitionedTimers()->pop() 956 957 #define KMP_EXCHANGE_PARTITIONED_TIMER(name) \ 958 __kmp_stats_thread_ptr->getPartitionedTimers()->exchange(explicitTimer( \ 959 __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name)) 960 961 #define KMP_SET_THREAD_STATE(state_name) \ 962 __kmp_stats_thread_ptr->setState(state_name) 963 964 #define KMP_GET_THREAD_STATE() __kmp_stats_thread_ptr->getState() 965 966 #define KMP_SET_THREAD_STATE_BLOCK(state_name) \ 967 blockThreadState __BTHREADSTATE__(__kmp_stats_thread_ptr->getStatePointer(), \ 968 state_name) 969 970 /*! 971 * \brief resets all stats (counters to 0, timers to 0 elapsed ticks) 972 * 973 * \details Reset all stats for all threads. 974 * 975 * @ingroup STATS_GATHERING 976 */ 977 #define KMP_RESET_STATS() __kmp_reset_stats() 978 979 #if (KMP_DEVELOPER_STATS) 980 #define KMP_COUNT_DEVELOPER_VALUE(n, v) KMP_COUNT_VALUE(n, v) 981 #define KMP_COUNT_DEVELOPER_BLOCK(n) KMP_COUNT_BLOCK(n) 982 #define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) KMP_TIME_PARTITIONED_BLOCK(n) 983 #define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) KMP_PUSH_PARTITIONED_TIMER(n) 984 #define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) KMP_POP_PARTITIONED_TIMER(n) 985 #define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) \ 986 KMP_EXCHANGE_PARTITIONED_TIMER(n) 987 #else 988 // Null definitions 989 #define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0) 990 #define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0) 991 #define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0) 992 #define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0) 993 #define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0) 994 #define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0) 995 #endif 996 997 #else // KMP_STATS_ENABLED 998 999 // Null definitions 1000 #define KMP_COUNT_VALUE(n, v) ((void)0) 1001 #define KMP_COUNT_BLOCK(n) ((void)0) 1002 1003 #define KMP_OUTPUT_STATS(heading_string) ((void)0) 1004 #define KMP_RESET_STATS() ((void)0) 1005 1006 #define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0) 1007 #define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0) 1008 #define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0) 1009 #define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0) 1010 #define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0) 1011 #define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0) 1012 #define KMP_INIT_PARTITIONED_TIMERS(name) ((void)0) 1013 #define KMP_TIME_PARTITIONED_BLOCK(name) ((void)0) 1014 #define KMP_PUSH_PARTITIONED_TIMER(name) ((void)0) 1015 #define KMP_POP_PARTITIONED_TIMER() ((void)0) 1016 #define KMP_SET_THREAD_STATE(state_name) ((void)0) 1017 #define KMP_GET_THREAD_STATE() ((void)0) 1018 #define KMP_SET_THREAD_STATE_BLOCK(state_name) ((void)0) 1019 #endif // KMP_STATS_ENABLED 1020 1021 #endif // KMP_STATS_H 1022