1 #ifndef KMP_STATS_H 2 #define KMP_STATS_H 3 4 /** @file kmp_stats.h 5 * Functions for collecting statistics. 6 */ 7 8 //===----------------------------------------------------------------------===// 9 // 10 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 11 // See https://llvm.org/LICENSE.txt for license information. 12 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 13 // 14 //===----------------------------------------------------------------------===// 15 16 #include "kmp_config.h" 17 #include "kmp_debug.h" 18 19 #if KMP_STATS_ENABLED 20 /* Statistics accumulator. 21 Accumulates number of samples and computes min, max, mean, standard deviation 22 on the fly. 23 24 Online variance calculation algorithm from 25 http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm 26 */ 27 28 #include "kmp_stats_timing.h" 29 #include <limits> 30 #include <math.h> 31 #include <new> // placement new 32 #include <stdint.h> 33 #include <string> 34 #include <vector> 35 36 /* Enable developer statistics here if you want them. They are more detailed 37 than is useful for application characterisation and are intended for the 38 runtime library developer. */ 39 #define KMP_DEVELOPER_STATS 0 40 41 /* Enable/Disable histogram output */ 42 #define KMP_STATS_HIST 0 43 44 /*! 45 * @ingroup STATS_GATHERING 46 * \brief flags to describe the statistic (timer or counter) 47 * 48 */ 49 enum stats_flags_e { 50 noTotal = 1 << 0, //!< do not show a TOTAL_aggregation for this statistic 51 onlyInMaster = 1 << 1, //!< statistic is valid only for primary thread 52 noUnits = 1 << 2, //!< statistic doesn't need units printed next to it 53 notInMaster = 1 << 3, //!< statistic is valid only for non-primary threads 54 logEvent = 1 << 4 //!< statistic can be logged on the event timeline when 55 //! KMP_STATS_EVENTS is on (valid only for timers) 56 }; 57 58 /*! 59 * @ingroup STATS_GATHERING 60 * \brief the states which a thread can be in 61 * 62 */ 63 enum stats_state_e { 64 IDLE, 65 SERIAL_REGION, 66 FORK_JOIN_BARRIER, 67 PLAIN_BARRIER, 68 TASKWAIT, 69 TASKYIELD, 70 TASKGROUP, 71 IMPLICIT_TASK, 72 EXPLICIT_TASK, 73 TEAMS_REGION 74 }; 75 76 /*! 77 * \brief Add new counters under KMP_FOREACH_COUNTER() macro in kmp_stats.h 78 * 79 * @param macro a user defined macro that takes three arguments - 80 * macro(COUNTER_NAME, flags, arg) 81 * @param arg a user defined argument to send to the user defined macro 82 * 83 * \details A counter counts the occurrence of some event. Each thread 84 * accumulates its own count, at the end of execution the counts are aggregated 85 * treating each thread as a separate measurement. (Unless onlyInMaster is set, 86 * in which case there's only a single measurement). The min,mean,max are 87 * therefore the values for the threads. Adding the counter here and then 88 * putting a KMP_BLOCK_COUNTER(name) at the point you want to count is all you 89 * need to do. All of the tables and printing is generated from this macro. 90 * Format is "macro(name, flags, arg)" 91 * 92 * @ingroup STATS_GATHERING 93 */ 94 // clang-format off 95 #define KMP_FOREACH_COUNTER(macro, arg) \ 96 macro(OMP_PARALLEL,stats_flags_e::onlyInMaster|stats_flags_e::noTotal,arg) \ 97 macro(OMP_NESTED_PARALLEL, 0, arg) \ 98 macro(OMP_LOOP_STATIC, 0, arg) \ 99 macro(OMP_LOOP_STATIC_STEAL, 0, arg) \ 100 macro(OMP_LOOP_DYNAMIC, 0, arg) \ 101 macro(OMP_DISTRIBUTE, 0, arg) \ 102 macro(OMP_BARRIER, 0, arg) \ 103 macro(OMP_CRITICAL, 0, arg) \ 104 macro(OMP_SINGLE, 0, arg) \ 105 macro(OMP_MASTER, 0, arg) \ 106 macro(OMP_MASKED, 0, arg) \ 107 macro(OMP_TEAMS, 0, arg) \ 108 macro(OMP_set_lock, 0, arg) \ 109 macro(OMP_test_lock, 0, arg) \ 110 macro(REDUCE_wait, 0, arg) \ 111 macro(REDUCE_nowait, 0, arg) \ 112 macro(OMP_TASKYIELD, 0, arg) \ 113 macro(OMP_TASKLOOP, 0, arg) \ 114 macro(TASK_executed, 0, arg) \ 115 macro(TASK_cancelled, 0, arg) \ 116 macro(TASK_stolen, 0, arg) 117 // clang-format on 118 119 /*! 120 * \brief Add new timers under KMP_FOREACH_TIMER() macro in kmp_stats.h 121 * 122 * @param macro a user defined macro that takes three arguments - 123 * macro(TIMER_NAME, flags, arg) 124 * @param arg a user defined argument to send to the user defined macro 125 * 126 * \details A timer collects multiple samples of some count in each thread and 127 * then finally aggregates all of the samples from all of the threads. For most 128 * timers the printing code also provides an aggregation over the thread totals. 129 * These are printed as TOTAL_foo. The count is normally a time (in ticks), 130 * hence the name "timer". (But can be any value, so we use this for "number of 131 * arguments passed to fork" as well). For timers the threads are not 132 * significant, it's the individual observations that count, so the statistics 133 * are at that level. Format is "macro(name, flags, arg)" 134 * 135 * @ingroup STATS_GATHERING2 136 */ 137 // clang-format off 138 #define KMP_FOREACH_TIMER(macro, arg) \ 139 macro (OMP_worker_thread_life, stats_flags_e::logEvent, arg) \ 140 macro (OMP_parallel, stats_flags_e::logEvent, arg) \ 141 macro (OMP_parallel_overhead, stats_flags_e::logEvent, arg) \ 142 macro (OMP_teams, stats_flags_e::logEvent, arg) \ 143 macro (OMP_teams_overhead, stats_flags_e::logEvent, arg) \ 144 macro (OMP_loop_static, 0, arg) \ 145 macro (OMP_loop_static_scheduling, 0, arg) \ 146 macro (OMP_loop_dynamic, 0, arg) \ 147 macro (OMP_loop_dynamic_scheduling, 0, arg) \ 148 macro (OMP_distribute, 0, arg) \ 149 macro (OMP_distribute_scheduling, 0, arg) \ 150 macro (OMP_critical, 0, arg) \ 151 macro (OMP_critical_wait, 0, arg) \ 152 macro (OMP_single, 0, arg) \ 153 macro (OMP_master, 0, arg) \ 154 macro (OMP_masked, 0, arg) \ 155 macro (OMP_task_immediate, 0, arg) \ 156 macro (OMP_task_taskwait, 0, arg) \ 157 macro (OMP_task_taskyield, 0, arg) \ 158 macro (OMP_task_taskgroup, 0, arg) \ 159 macro (OMP_task_join_bar, 0, arg) \ 160 macro (OMP_task_plain_bar, 0, arg) \ 161 macro (OMP_taskloop_scheduling, 0, arg) \ 162 macro (OMP_plain_barrier, stats_flags_e::logEvent, arg) \ 163 macro (OMP_idle, stats_flags_e::logEvent, arg) \ 164 macro (OMP_fork_barrier, stats_flags_e::logEvent, arg) \ 165 macro (OMP_join_barrier, stats_flags_e::logEvent, arg) \ 166 macro (OMP_serial, stats_flags_e::logEvent, arg) \ 167 macro (OMP_set_numthreads, stats_flags_e::noUnits | stats_flags_e::noTotal, \ 168 arg) \ 169 macro (OMP_PARALLEL_args, stats_flags_e::noUnits | stats_flags_e::noTotal, \ 170 arg) \ 171 macro (OMP_loop_static_iterations, \ 172 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ 173 macro (OMP_loop_static_total_iterations, \ 174 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ 175 macro (OMP_loop_dynamic_iterations, \ 176 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ 177 macro (OMP_loop_dynamic_total_iterations, \ 178 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ 179 macro (OMP_distribute_iterations, \ 180 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ 181 KMP_FOREACH_DEVELOPER_TIMER(macro, arg) 182 // clang-format on 183 184 // OMP_worker_thread_life -- Time from thread becoming an OpenMP thread (either 185 // initializing OpenMP or being created by a primary 186 // thread) until the thread is destroyed 187 // OMP_parallel -- Time thread spends executing work directly 188 // within a #pragma omp parallel 189 // OMP_parallel_overhead -- Time thread spends setting up a parallel region 190 // OMP_loop_static -- Time thread spends executing loop iterations from 191 // a statically scheduled loop 192 // OMP_loop_static_scheduling -- Time thread spends scheduling loop iterations 193 // from a statically scheduled loop 194 // OMP_loop_dynamic -- Time thread spends executing loop iterations from 195 // a dynamically scheduled loop 196 // OMP_loop_dynamic_scheduling -- Time thread spends scheduling loop iterations 197 // from a dynamically scheduled loop 198 // OMP_critical -- Time thread spends executing critical section 199 // OMP_critical_wait -- Time thread spends waiting to enter 200 // a critical section 201 // OMP_single -- Time spent executing a "single" region 202 // OMP_master -- Time spent executing a "master" region 203 // OMP_masked -- Time spent executing a "masked" region 204 // OMP_task_immediate -- Time spent executing non-deferred tasks 205 // OMP_task_taskwait -- Time spent executing tasks inside a taskwait 206 // construct 207 // OMP_task_taskyield -- Time spent executing tasks inside a taskyield 208 // construct 209 // OMP_task_taskgroup -- Time spent executing tasks inside a taskygroup 210 // construct 211 // OMP_task_join_bar -- Time spent executing tasks inside a join barrier 212 // OMP_task_plain_bar -- Time spent executing tasks inside a barrier 213 // construct 214 // OMP_taskloop_scheduling -- Time spent scheduling tasks inside a taskloop 215 // construct 216 // OMP_plain_barrier -- Time spent in a #pragma omp barrier construct or 217 // inside implicit barrier at end of worksharing 218 // construct 219 // OMP_idle -- Time worker threads spend waiting for next 220 // parallel region 221 // OMP_fork_barrier -- Time spent in a the fork barrier surrounding a 222 // parallel region 223 // OMP_join_barrier -- Time spent in a the join barrier surrounding a 224 // parallel region 225 // OMP_serial -- Time thread zero spends executing serial code 226 // OMP_set_numthreads -- Values passed to omp_set_num_threads 227 // OMP_PARALLEL_args -- Number of arguments passed to a parallel region 228 // OMP_loop_static_iterations -- Number of iterations thread is assigned for 229 // statically scheduled loops 230 // OMP_loop_dynamic_iterations -- Number of iterations thread is assigned for 231 // dynamically scheduled loops 232 233 #if (KMP_DEVELOPER_STATS) 234 // Timers which are of interest to runtime library developers, not end users. 235 // These have to be explicitly enabled in addition to the other stats. 236 237 // KMP_fork_barrier -- time in __kmp_fork_barrier 238 // KMP_join_barrier -- time in __kmp_join_barrier 239 // KMP_barrier -- time in __kmp_barrier 240 // KMP_end_split_barrier -- time in __kmp_end_split_barrier 241 // KMP_setup_icv_copy -- time in __kmp_setup_icv_copy 242 // KMP_icv_copy -- start/stop timer for any ICV copying 243 // KMP_linear_gather -- time in __kmp_linear_barrier_gather 244 // KMP_linear_release -- time in __kmp_linear_barrier_release 245 // KMP_tree_gather -- time in __kmp_tree_barrier_gather 246 // KMP_tree_release -- time in __kmp_tree_barrier_release 247 // KMP_hyper_gather -- time in __kmp_hyper_barrier_gather 248 // KMP_hyper_release -- time in __kmp_hyper_barrier_release 249 // clang-format off 250 #define KMP_FOREACH_DEVELOPER_TIMER(macro, arg) \ 251 macro(KMP_fork_call, 0, arg) \ 252 macro(KMP_join_call, 0, arg) \ 253 macro(KMP_end_split_barrier, 0, arg) \ 254 macro(KMP_hier_gather, 0, arg) \ 255 macro(KMP_hier_release, 0, arg) \ 256 macro(KMP_hyper_gather, 0, arg) \ 257 macro(KMP_hyper_release, 0, arg) \ 258 macro(KMP_linear_gather, 0, arg) \ 259 macro(KMP_linear_release, 0, arg) \ 260 macro(KMP_tree_gather, 0, arg) \ 261 macro(KMP_tree_release, 0, arg) \ 262 macro(USER_resume, 0, arg) \ 263 macro(USER_suspend, 0, arg) \ 264 macro(USER_mwait, 0, arg) \ 265 macro(KMP_allocate_team, 0, arg) \ 266 macro(KMP_setup_icv_copy, 0, arg) \ 267 macro(USER_icv_copy, 0, arg) \ 268 macro (FOR_static_steal_stolen, \ 269 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ 270 macro (FOR_static_steal_chunks, \ 271 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) 272 #else 273 #define KMP_FOREACH_DEVELOPER_TIMER(macro, arg) 274 #endif 275 // clang-format on 276 277 /*! 278 * \brief Add new explicit timers under KMP_FOREACH_EXPLICIT_TIMER() macro. 279 * 280 * @param macro a user defined macro that takes three arguments - 281 * macro(TIMER_NAME, flags, arg) 282 * @param arg a user defined argument to send to the user defined macro 283 * 284 * \warning YOU MUST HAVE THE SAME NAMED TIMER UNDER KMP_FOREACH_TIMER() OR ELSE 285 * BAD THINGS WILL HAPPEN! 286 * 287 * \details Explicit timers are ones where we need to allocate a timer itself 288 * (as well as the accumulated timing statistics). We allocate these on a 289 * per-thread basis, and explicitly start and stop them. Block timers just 290 * allocate the timer itself on the stack, and use the destructor to notice 291 * block exit; they don't need to be defined here. The name here should be the 292 * same as that of a timer above. 293 * 294 * @ingroup STATS_GATHERING 295 */ 296 #define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) KMP_FOREACH_TIMER(macro, arg) 297 298 #define ENUMERATE(name, ignore, prefix) prefix##name, 299 enum timer_e { KMP_FOREACH_TIMER(ENUMERATE, TIMER_) TIMER_LAST }; 300 301 enum explicit_timer_e { 302 KMP_FOREACH_EXPLICIT_TIMER(ENUMERATE, EXPLICIT_TIMER_) EXPLICIT_TIMER_LAST 303 }; 304 305 enum counter_e { KMP_FOREACH_COUNTER(ENUMERATE, COUNTER_) COUNTER_LAST }; 306 #undef ENUMERATE 307 308 /* 309 * A logarithmic histogram. It accumulates the number of values in each power of 310 * ten bin. So 1<=x<10, 10<=x<100, ... 311 * Mostly useful where we have some big outliers and want to see information 312 * about them. 313 */ 314 class logHistogram { 315 enum { 316 numBins = 31, /* Number of powers of 10. If this changes you need to change 317 * the initializer for binMax */ 318 319 /* 320 * If you want to use this to analyse values that may be less than 1, (for 321 * instance times in s), then the logOffset gives you negative powers. 322 * In our case here, we're just looking at times in ticks, or counts, so we 323 * can never see values with magnitude < 1 (other than zero), so we can set 324 * it to 0. As above change the initializer if you change this. 325 */ 326 logOffset = 0 327 }; 328 uint32_t KMP_ALIGN_CACHE zeroCount; 329 struct { 330 uint32_t count; 331 double total; 332 } bins[numBins]; 333 334 static double binMax[numBins]; 335 336 #ifdef KMP_DEBUG 337 uint64_t _total; 338 339 void check() const { 340 uint64_t t = zeroCount; 341 for (int i = 0; i < numBins; i++) 342 t += bins[i].count; 343 KMP_DEBUG_ASSERT(t == _total); 344 } 345 #else 346 void check() const {} 347 #endif 348 349 public: 350 logHistogram() { reset(); } 351 352 logHistogram(logHistogram const &o) { 353 for (int i = 0; i < numBins; i++) 354 bins[i] = o.bins[i]; 355 #ifdef KMP_DEBUG 356 _total = o._total; 357 #endif 358 } 359 360 void reset() { 361 zeroCount = 0; 362 for (int i = 0; i < numBins; i++) { 363 bins[i].count = 0; 364 bins[i].total = 0; 365 } 366 367 #ifdef KMP_DEBUG 368 _total = 0; 369 #endif 370 } 371 uint32_t count(int b) const { return bins[b + logOffset].count; } 372 double total(int b) const { return bins[b + logOffset].total; } 373 static uint32_t findBin(double sample); 374 375 logHistogram &operator+=(logHistogram const &o) { 376 zeroCount += o.zeroCount; 377 for (int i = 0; i < numBins; i++) { 378 bins[i].count += o.bins[i].count; 379 bins[i].total += o.bins[i].total; 380 } 381 #ifdef KMP_DEBUG 382 _total += o._total; 383 check(); 384 #endif 385 386 return *this; 387 } 388 389 void addSample(double sample); 390 int minBin() const; 391 int maxBin() const; 392 393 std::string format(char) const; 394 }; 395 396 class statistic { 397 double KMP_ALIGN_CACHE minVal; 398 double maxVal; 399 double meanVal; 400 double m2; 401 uint64_t sampleCount; 402 double offset; 403 bool collectingHist; 404 logHistogram hist; 405 406 public: 407 statistic(bool doHist = bool(KMP_STATS_HIST)) { 408 reset(); 409 collectingHist = doHist; 410 } 411 statistic(statistic const &o) 412 : minVal(o.minVal), maxVal(o.maxVal), meanVal(o.meanVal), m2(o.m2), 413 sampleCount(o.sampleCount), offset(o.offset), 414 collectingHist(o.collectingHist), hist(o.hist) {} 415 statistic(double minv, double maxv, double meanv, uint64_t sc, double sd) 416 : minVal(minv), maxVal(maxv), meanVal(meanv), m2(sd * sd * sc), 417 sampleCount(sc), offset(0.0), collectingHist(false) {} 418 bool haveHist() const { return collectingHist; } 419 double getMin() const { return minVal; } 420 double getMean() const { return meanVal; } 421 double getMax() const { return maxVal; } 422 uint64_t getCount() const { return sampleCount; } 423 double getSD() const { return sqrt(m2 / sampleCount); } 424 double getTotal() const { return sampleCount * meanVal; } 425 logHistogram const *getHist() const { return &hist; } 426 void setOffset(double d) { offset = d; } 427 428 void reset() { 429 minVal = (std::numeric_limits<double>::max)(); 430 maxVal = -minVal; 431 meanVal = 0.0; 432 m2 = 0.0; 433 sampleCount = 0; 434 offset = 0.0; 435 hist.reset(); 436 } 437 void addSample(double sample); 438 void scale(double factor); 439 void scaleDown(double f) { scale(1. / f); } 440 void forceCount(uint64_t count) { sampleCount = count; } 441 statistic &operator+=(statistic const &other); 442 443 std::string format(char unit, bool total = false) const; 444 std::string formatHist(char unit) const { return hist.format(unit); } 445 }; 446 447 struct statInfo { 448 const char *name; 449 uint32_t flags; 450 }; 451 452 class timeStat : public statistic { 453 static statInfo timerInfo[]; 454 455 public: 456 timeStat() : statistic() {} 457 static const char *name(timer_e e) { return timerInfo[e].name; } 458 static bool noTotal(timer_e e) { 459 return timerInfo[e].flags & stats_flags_e::noTotal; 460 } 461 static bool masterOnly(timer_e e) { 462 return timerInfo[e].flags & stats_flags_e::onlyInMaster; 463 } 464 static bool workerOnly(timer_e e) { 465 return timerInfo[e].flags & stats_flags_e::notInMaster; 466 } 467 static bool noUnits(timer_e e) { 468 return timerInfo[e].flags & stats_flags_e::noUnits; 469 } 470 static bool logEvent(timer_e e) { 471 return timerInfo[e].flags & stats_flags_e::logEvent; 472 } 473 static void clearEventFlags() { 474 for (int i = 0; i < TIMER_LAST; i++) { 475 timerInfo[i].flags &= (~(stats_flags_e::logEvent)); 476 } 477 } 478 }; 479 480 // Where we need explicitly to start and end the timer, this version can be used 481 // Since these timers normally aren't nicely scoped, so don't have a good place 482 // to live on the stack of the thread, they're more work to use. 483 class explicitTimer { 484 timeStat *stat; 485 timer_e timerEnumValue; 486 tsc_tick_count startTime; 487 tsc_tick_count pauseStartTime; 488 tsc_tick_count::tsc_interval_t totalPauseTime; 489 490 public: 491 explicitTimer(timeStat *s, timer_e te) 492 : stat(s), timerEnumValue(te), startTime(), pauseStartTime(0), 493 totalPauseTime() {} 494 495 // void setStat(timeStat *s) { stat = s; } 496 void start(tsc_tick_count tick); 497 void pause(tsc_tick_count tick) { pauseStartTime = tick; } 498 void resume(tsc_tick_count tick) { 499 totalPauseTime += (tick - pauseStartTime); 500 } 501 void stop(tsc_tick_count tick, kmp_stats_list *stats_ptr = nullptr); 502 void reset() { 503 startTime = 0; 504 pauseStartTime = 0; 505 totalPauseTime = 0; 506 } 507 timer_e get_type() const { return timerEnumValue; } 508 }; 509 510 // Where you need to partition a threads clock ticks into separate states 511 // e.g., a partitionedTimers class with two timers of EXECUTING_TASK, and 512 // DOING_NOTHING would render these conditions: 513 // time(EXECUTING_TASK) + time(DOING_NOTHING) = total time thread is alive 514 // No clock tick in the EXECUTING_TASK is a member of DOING_NOTHING and vice 515 // versa 516 class partitionedTimers { 517 private: 518 std::vector<explicitTimer> timer_stack; 519 520 public: 521 partitionedTimers(); 522 void init(explicitTimer timer); 523 void exchange(explicitTimer timer); 524 void push(explicitTimer timer); 525 void pop(); 526 void windup(); 527 }; 528 529 // Special wrapper around the partitioned timers to aid timing code blocks 530 // It avoids the need to have an explicit end, leaving the scope suffices. 531 class blockPartitionedTimer { 532 partitionedTimers *part_timers; 533 534 public: 535 blockPartitionedTimer(partitionedTimers *pt, explicitTimer timer) 536 : part_timers(pt) { 537 part_timers->push(timer); 538 } 539 ~blockPartitionedTimer() { part_timers->pop(); } 540 }; 541 542 // Special wrapper around the thread state to aid in keeping state in code 543 // blocks It avoids the need to have an explicit end, leaving the scope 544 // suffices. 545 class blockThreadState { 546 stats_state_e *state_pointer; 547 stats_state_e old_state; 548 549 public: 550 blockThreadState(stats_state_e *thread_state_pointer, stats_state_e new_state) 551 : state_pointer(thread_state_pointer), old_state(*thread_state_pointer) { 552 *state_pointer = new_state; 553 } 554 ~blockThreadState() { *state_pointer = old_state; } 555 }; 556 557 // If all you want is a count, then you can use this... 558 // The individual per-thread counts will be aggregated into a statistic at 559 // program exit. 560 class counter { 561 uint64_t value; 562 static const statInfo counterInfo[]; 563 564 public: 565 counter() : value(0) {} 566 void increment() { value++; } 567 uint64_t getValue() const { return value; } 568 void reset() { value = 0; } 569 static const char *name(counter_e e) { return counterInfo[e].name; } 570 static bool masterOnly(counter_e e) { 571 return counterInfo[e].flags & stats_flags_e::onlyInMaster; 572 } 573 }; 574 575 /* **************************************************************** 576 Class to implement an event 577 578 There are four components to an event: start time, stop time 579 nest_level, and timer_name. 580 The start and stop time should be obvious (recorded in clock ticks). 581 The nest_level relates to the bar width in the timeline graph. 582 The timer_name is used to determine which timer event triggered this event. 583 584 the interface to this class is through four read-only operations: 585 1) getStart() -- returns the start time as 64 bit integer 586 2) getStop() -- returns the stop time as 64 bit integer 587 3) getNestLevel() -- returns the nest level of the event 588 4) getTimerName() -- returns the timer name that triggered event 589 590 *MORE ON NEST_LEVEL* 591 The nest level is used in the bar graph that represents the timeline. 592 Its main purpose is for showing how events are nested inside eachother. 593 For example, say events, A, B, and C are recorded. If the timeline 594 looks like this: 595 596 Begin -------------------------------------------------------------> Time 597 | | | | | | 598 A B C C B A 599 start start start end end end 600 601 Then A, B, C will have a nest level of 1, 2, 3 respectively. 602 These values are then used to calculate the barwidth so you can 603 see that inside A, B has occurred, and inside B, C has occurred. 604 Currently, this is shown with A's bar width being larger than B's 605 bar width, and B's bar width being larger than C's bar width. 606 607 **************************************************************** */ 608 class kmp_stats_event { 609 uint64_t start; 610 uint64_t stop; 611 int nest_level; 612 timer_e timer_name; 613 614 public: 615 kmp_stats_event() 616 : start(0), stop(0), nest_level(0), timer_name(TIMER_LAST) {} 617 kmp_stats_event(uint64_t strt, uint64_t stp, int nst, timer_e nme) 618 : start(strt), stop(stp), nest_level(nst), timer_name(nme) {} 619 inline uint64_t getStart() const { return start; } 620 inline uint64_t getStop() const { return stop; } 621 inline int getNestLevel() const { return nest_level; } 622 inline timer_e getTimerName() const { return timer_name; } 623 }; 624 625 /* **************************************************************** 626 Class to implement a dynamically expandable array of events 627 628 --------------------------------------------------------- 629 | event 1 | event 2 | event 3 | event 4 | ... | event N | 630 --------------------------------------------------------- 631 632 An event is pushed onto the back of this array at every 633 explicitTimer->stop() call. The event records the thread #, 634 start time, stop time, and nest level related to the bar width. 635 636 The event vector starts at size INIT_SIZE and grows (doubles in size) 637 if needed. An implication of this behavior is that log(N) 638 reallocations are needed (where N is number of events). If you want 639 to avoid reallocations, then set INIT_SIZE to a large value. 640 641 the interface to this class is through six operations: 642 1) reset() -- sets the internal_size back to 0 but does not deallocate any 643 memory 644 2) size() -- returns the number of valid elements in the vector 645 3) push_back(start, stop, nest, timer_name) -- pushes an event onto 646 the back of the array 647 4) deallocate() -- frees all memory associated with the vector 648 5) sort() -- sorts the vector by start time 649 6) operator[index] or at(index) -- returns event reference at that index 650 **************************************************************** */ 651 class kmp_stats_event_vector { 652 kmp_stats_event *events; 653 int internal_size; 654 int allocated_size; 655 static const int INIT_SIZE = 1024; 656 657 public: 658 kmp_stats_event_vector() { 659 events = 660 (kmp_stats_event *)__kmp_allocate(sizeof(kmp_stats_event) * INIT_SIZE); 661 internal_size = 0; 662 allocated_size = INIT_SIZE; 663 } 664 ~kmp_stats_event_vector() {} 665 inline void reset() { internal_size = 0; } 666 inline int size() const { return internal_size; } 667 void push_back(uint64_t start_time, uint64_t stop_time, int nest_level, 668 timer_e name) { 669 int i; 670 if (internal_size == allocated_size) { 671 kmp_stats_event *tmp = (kmp_stats_event *)__kmp_allocate( 672 sizeof(kmp_stats_event) * allocated_size * 2); 673 for (i = 0; i < internal_size; i++) 674 tmp[i] = events[i]; 675 __kmp_free(events); 676 events = tmp; 677 allocated_size *= 2; 678 } 679 events[internal_size] = 680 kmp_stats_event(start_time, stop_time, nest_level, name); 681 internal_size++; 682 return; 683 } 684 void deallocate(); 685 void sort(); 686 const kmp_stats_event &operator[](int index) const { return events[index]; } 687 kmp_stats_event &operator[](int index) { return events[index]; } 688 const kmp_stats_event &at(int index) const { return events[index]; } 689 kmp_stats_event &at(int index) { return events[index]; } 690 }; 691 692 /* **************************************************************** 693 Class to implement a doubly-linked, circular, statistics list 694 695 |---| ---> |---| ---> |---| ---> |---| ---> ... next 696 | | | | | | | | 697 |---| <--- |---| <--- |---| <--- |---| <--- ... prev 698 Sentinel first second third 699 Node node node node 700 701 The Sentinel Node is the user handle on the list. 702 The first node corresponds to thread 0's statistics. 703 The second node corresponds to thread 1's statistics and so on... 704 705 Each node has a _timers, _counters, and _explicitTimers array to hold that 706 thread's statistics. The _explicitTimers point to the correct _timer and 707 update its statistics at every stop() call. The explicitTimers' pointers are 708 set up in the constructor. Each node also has an event vector to hold that 709 thread's timing events. The event vector expands as necessary and records 710 the start-stop times for each timer. 711 712 The nestLevel variable is for plotting events and is related 713 to the bar width in the timeline graph. 714 715 Every thread will have a thread local pointer to its node in 716 the list. The sentinel node is used by the primary thread to 717 store "dummy" statistics before __kmp_create_worker() is called. 718 **************************************************************** */ 719 class kmp_stats_list { 720 int gtid; 721 timeStat _timers[TIMER_LAST + 1]; 722 counter _counters[COUNTER_LAST + 1]; 723 explicitTimer thread_life_timer; 724 partitionedTimers _partitionedTimers; 725 int _nestLevel; // one per thread 726 kmp_stats_event_vector _event_vector; 727 kmp_stats_list *next; 728 kmp_stats_list *prev; 729 stats_state_e state; 730 int thread_is_idle_flag; 731 732 public: 733 kmp_stats_list() 734 : thread_life_timer(&_timers[TIMER_OMP_worker_thread_life], 735 TIMER_OMP_worker_thread_life), 736 _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE), 737 thread_is_idle_flag(0) {} 738 ~kmp_stats_list() {} 739 inline timeStat *getTimer(timer_e idx) { return &_timers[idx]; } 740 inline counter *getCounter(counter_e idx) { return &_counters[idx]; } 741 inline partitionedTimers *getPartitionedTimers() { 742 return &_partitionedTimers; 743 } 744 inline timeStat *getTimers() { return _timers; } 745 inline counter *getCounters() { return _counters; } 746 inline kmp_stats_event_vector &getEventVector() { return _event_vector; } 747 inline void startLife() { thread_life_timer.start(tsc_tick_count::now()); } 748 inline void endLife() { thread_life_timer.stop(tsc_tick_count::now(), this); } 749 inline void resetEventVector() { _event_vector.reset(); } 750 inline void incrementNestValue() { _nestLevel++; } 751 inline int getNestValue() { return _nestLevel; } 752 inline void decrementNestValue() { _nestLevel--; } 753 inline int getGtid() const { return gtid; } 754 inline void setGtid(int newgtid) { gtid = newgtid; } 755 inline void setState(stats_state_e newstate) { state = newstate; } 756 inline stats_state_e getState() const { return state; } 757 inline stats_state_e *getStatePointer() { return &state; } 758 inline bool isIdle() { return thread_is_idle_flag == 1; } 759 inline void setIdleFlag() { thread_is_idle_flag = 1; } 760 inline void resetIdleFlag() { thread_is_idle_flag = 0; } 761 kmp_stats_list *push_back(int gtid); // returns newly created list node 762 inline void push_event(uint64_t start_time, uint64_t stop_time, 763 int nest_level, timer_e name) { 764 _event_vector.push_back(start_time, stop_time, nest_level, name); 765 } 766 void deallocate(); 767 class iterator; 768 kmp_stats_list::iterator begin(); 769 kmp_stats_list::iterator end(); 770 int size(); 771 class iterator { 772 kmp_stats_list *ptr; 773 friend kmp_stats_list::iterator kmp_stats_list::begin(); 774 friend kmp_stats_list::iterator kmp_stats_list::end(); 775 776 public: 777 iterator(); 778 ~iterator(); 779 iterator operator++(); 780 iterator operator++(int dummy); 781 iterator operator--(); 782 iterator operator--(int dummy); 783 bool operator!=(const iterator &rhs); 784 bool operator==(const iterator &rhs); 785 kmp_stats_list *operator*() const; // dereference operator 786 }; 787 }; 788 789 /* **************************************************************** 790 Class to encapsulate all output functions and the environment variables 791 792 This module holds filenames for various outputs (normal stats, events, plot 793 file), as well as coloring information for the plot file. 794 795 The filenames and flags variables are read from environment variables. 796 These are read once by the constructor of the global variable 797 __kmp_stats_output which calls init(). 798 799 During this init() call, event flags for the timeStat::timerInfo[] global 800 array are cleared if KMP_STATS_EVENTS is not true (on, 1, yes). 801 802 The only interface function that is public is outputStats(heading). This 803 function should print out everything it needs to, either to files or stderr, 804 depending on the environment variables described below 805 806 ENVIRONMENT VARIABLES: 807 KMP_STATS_FILE -- if set, all statistics (not events) will be printed to this 808 file, otherwise, print to stderr 809 KMP_STATS_THREADS -- if set to "on", then will print per thread statistics to 810 either KMP_STATS_FILE or stderr 811 KMP_STATS_PLOT_FILE -- if set, print the ploticus plot file to this filename, 812 otherwise, the plot file is sent to "events.plt" 813 KMP_STATS_EVENTS -- if set to "on", then log events, otherwise, don't log 814 events 815 KMP_STATS_EVENTS_FILE -- if set, all events are outputted to this file, 816 otherwise, output is sent to "events.dat" 817 **************************************************************** */ 818 class kmp_stats_output_module { 819 820 public: 821 struct rgb_color { 822 float r; 823 float g; 824 float b; 825 }; 826 827 private: 828 std::string outputFileName; 829 static const char *eventsFileName; 830 static const char *plotFileName; 831 static int printPerThreadFlag; 832 static int printPerThreadEventsFlag; 833 static const rgb_color globalColorArray[]; 834 static rgb_color timerColorInfo[]; 835 836 void init(); 837 static void setupEventColors(); 838 static void printPloticusFile(); 839 static void printHeaderInfo(FILE *statsOut); 840 static void printTimerStats(FILE *statsOut, statistic const *theStats, 841 statistic const *totalStats); 842 static void printCounterStats(FILE *statsOut, statistic const *theStats); 843 static void printCounters(FILE *statsOut, counter const *theCounters); 844 static void printEvents(FILE *eventsOut, kmp_stats_event_vector *theEvents, 845 int gtid); 846 static rgb_color getEventColor(timer_e e) { return timerColorInfo[e]; } 847 static void windupExplicitTimers(); 848 bool eventPrintingEnabled() const { return printPerThreadEventsFlag; } 849 850 public: 851 kmp_stats_output_module() { init(); } 852 void outputStats(const char *heading); 853 }; 854 855 #ifdef __cplusplus 856 extern "C" { 857 #endif 858 void __kmp_stats_init(); 859 void __kmp_stats_fini(); 860 void __kmp_reset_stats(); 861 void __kmp_output_stats(const char *); 862 void __kmp_accumulate_stats_at_exit(void); 863 // thread local pointer to stats node within list 864 extern KMP_THREAD_LOCAL kmp_stats_list *__kmp_stats_thread_ptr; 865 // head to stats list. 866 extern kmp_stats_list *__kmp_stats_list; 867 // lock for __kmp_stats_list 868 extern kmp_tas_lock_t __kmp_stats_lock; 869 // reference start time 870 extern tsc_tick_count __kmp_stats_start_time; 871 // interface to output 872 extern kmp_stats_output_module __kmp_stats_output; 873 874 #ifdef __cplusplus 875 } 876 #endif 877 878 // Simple, standard interfaces that drop out completely if stats aren't enabled 879 880 /*! 881 * \brief Adds value to specified timer (name). 882 * 883 * @param name timer name as specified under the KMP_FOREACH_TIMER() macro 884 * @param value double precision sample value to add to statistics for the timer 885 * 886 * \details Use KMP_COUNT_VALUE(name, value) macro to add a particular value to 887 * a timer statistics. 888 * 889 * @ingroup STATS_GATHERING 890 */ 891 #define KMP_COUNT_VALUE(name, value) \ 892 __kmp_stats_thread_ptr->getTimer(TIMER_##name)->addSample((double)value) 893 894 /*! 895 * \brief Increments specified counter (name). 896 * 897 * @param name counter name as specified under the KMP_FOREACH_COUNTER() macro 898 * 899 * \details Use KMP_COUNT_BLOCK(name, value) macro to increment a statistics 900 * counter for the executing thread. 901 * 902 * @ingroup STATS_GATHERING 903 */ 904 #define KMP_COUNT_BLOCK(name) \ 905 __kmp_stats_thread_ptr->getCounter(COUNTER_##name)->increment() 906 907 /*! 908 * \brief Outputs the current thread statistics and reset them. 909 * 910 * @param heading_string heading put above the final stats output 911 * 912 * \details Explicitly stops all timers and outputs all stats. Environment 913 * variable, `OMPTB_STATSFILE=filename`, can be used to output the stats to a 914 * filename instead of stderr. Environment variable, 915 * `OMPTB_STATSTHREADS=true|undefined`, can be used to output thread specific 916 * stats. For now the `OMPTB_STATSTHREADS` environment variable can either be 917 * defined with any value, which will print out thread specific stats, or it can 918 * be undefined (not specified in the environment) and thread specific stats 919 * won't be printed. It should be noted that all statistics are reset when this 920 * macro is called. 921 * 922 * @ingroup STATS_GATHERING 923 */ 924 #define KMP_OUTPUT_STATS(heading_string) __kmp_output_stats(heading_string) 925 926 /*! 927 * \brief Initializes the partitioned timers to begin with name. 928 * 929 * @param name timer which you want this thread to begin with 930 * 931 * @ingroup STATS_GATHERING 932 */ 933 #define KMP_INIT_PARTITIONED_TIMERS(name) \ 934 __kmp_stats_thread_ptr->getPartitionedTimers()->init(explicitTimer( \ 935 __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name)) 936 937 #define KMP_TIME_PARTITIONED_BLOCK(name) \ 938 blockPartitionedTimer __PBLOCKTIME__( \ 939 __kmp_stats_thread_ptr->getPartitionedTimers(), \ 940 explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name), \ 941 TIMER_##name)) 942 943 #define KMP_PUSH_PARTITIONED_TIMER(name) \ 944 __kmp_stats_thread_ptr->getPartitionedTimers()->push(explicitTimer( \ 945 __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name)) 946 947 #define KMP_POP_PARTITIONED_TIMER() \ 948 __kmp_stats_thread_ptr->getPartitionedTimers()->pop() 949 950 #define KMP_EXCHANGE_PARTITIONED_TIMER(name) \ 951 __kmp_stats_thread_ptr->getPartitionedTimers()->exchange(explicitTimer( \ 952 __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name)) 953 954 #define KMP_SET_THREAD_STATE(state_name) \ 955 __kmp_stats_thread_ptr->setState(state_name) 956 957 #define KMP_GET_THREAD_STATE() __kmp_stats_thread_ptr->getState() 958 959 #define KMP_SET_THREAD_STATE_BLOCK(state_name) \ 960 blockThreadState __BTHREADSTATE__(__kmp_stats_thread_ptr->getStatePointer(), \ 961 state_name) 962 963 /*! 964 * \brief resets all stats (counters to 0, timers to 0 elapsed ticks) 965 * 966 * \details Reset all stats for all threads. 967 * 968 * @ingroup STATS_GATHERING 969 */ 970 #define KMP_RESET_STATS() __kmp_reset_stats() 971 972 #if (KMP_DEVELOPER_STATS) 973 #define KMP_COUNT_DEVELOPER_VALUE(n, v) KMP_COUNT_VALUE(n, v) 974 #define KMP_COUNT_DEVELOPER_BLOCK(n) KMP_COUNT_BLOCK(n) 975 #define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) KMP_TIME_PARTITIONED_BLOCK(n) 976 #define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) KMP_PUSH_PARTITIONED_TIMER(n) 977 #define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) KMP_POP_PARTITIONED_TIMER(n) 978 #define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) \ 979 KMP_EXCHANGE_PARTITIONED_TIMER(n) 980 #else 981 // Null definitions 982 #define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0) 983 #define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0) 984 #define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0) 985 #define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0) 986 #define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0) 987 #define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0) 988 #endif 989 990 #else // KMP_STATS_ENABLED 991 992 // Null definitions 993 #define KMP_COUNT_VALUE(n, v) ((void)0) 994 #define KMP_COUNT_BLOCK(n) ((void)0) 995 996 #define KMP_OUTPUT_STATS(heading_string) ((void)0) 997 #define KMP_RESET_STATS() ((void)0) 998 999 #define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0) 1000 #define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0) 1001 #define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0) 1002 #define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0) 1003 #define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0) 1004 #define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0) 1005 #define KMP_INIT_PARTITIONED_TIMERS(name) ((void)0) 1006 #define KMP_TIME_PARTITIONED_BLOCK(name) ((void)0) 1007 #define KMP_PUSH_PARTITIONED_TIMER(name) ((void)0) 1008 #define KMP_POP_PARTITIONED_TIMER() ((void)0) 1009 #define KMP_SET_THREAD_STATE(state_name) ((void)0) 1010 #define KMP_GET_THREAD_STATE() ((void)0) 1011 #define KMP_SET_THREAD_STATE_BLOCK(state_name) ((void)0) 1012 #endif // KMP_STATS_ENABLED 1013 1014 #endif // KMP_STATS_H 1015