1 #ifndef KMP_STATS_H 2 #define KMP_STATS_H 3 4 /** @file kmp_stats.h 5 * Functions for collecting statistics. 6 */ 7 8 //===----------------------------------------------------------------------===// 9 // 10 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 11 // See https://llvm.org/LICENSE.txt for license information. 12 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 13 // 14 //===----------------------------------------------------------------------===// 15 16 #include "kmp_config.h" 17 #include "kmp_debug.h" 18 19 #if KMP_STATS_ENABLED 20 /* Statistics accumulator. 21 Accumulates number of samples and computes min, max, mean, standard deviation 22 on the fly. 23 24 Online variance calculation algorithm from 25 http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm 26 */ 27 28 #include "kmp_stats_timing.h" 29 #include <limits> 30 #include <math.h> 31 #include <new> // placement new 32 #include <stdint.h> 33 #include <string> 34 #include <vector> 35 36 /* Enable developer statistics here if you want them. They are more detailed 37 than is useful for application characterisation and are intended for the 38 runtime library developer. */ 39 #define KMP_DEVELOPER_STATS 0 40 41 /* Enable/Disable histogram output */ 42 #define KMP_STATS_HIST 0 43 44 /*! 45 * @ingroup STATS_GATHERING 46 * \brief flags to describe the statistic (timer or counter) 47 * 48 */ 49 enum stats_flags_e { 50 noTotal = 1 << 0, //!< do not show a TOTAL_aggregation for this statistic 51 onlyInMaster = 1 << 1, //!< statistic is valid only for primary thread 52 noUnits = 1 << 2, //!< statistic doesn't need units printed next to it 53 notInMaster = 1 << 3, //!< statistic is valid only for non-primary threads 54 logEvent = 1 << 4 //!< statistic can be logged on the event timeline when 55 //! KMP_STATS_EVENTS is on (valid only for timers) 56 }; 57 58 /*! 59 * @ingroup STATS_GATHERING 60 * \brief the states which a thread can be in 61 * 62 */ 63 enum stats_state_e { 64 IDLE, 65 SERIAL_REGION, 66 FORK_JOIN_BARRIER, 67 PLAIN_BARRIER, 68 TASKWAIT, 69 TASKYIELD, 70 TASKGROUP, 71 IMPLICIT_TASK, 72 EXPLICIT_TASK, 73 TEAMS_REGION 74 }; 75 76 /*! 77 * \brief Add new counters under KMP_FOREACH_COUNTER() macro in kmp_stats.h 78 * 79 * @param macro a user defined macro that takes three arguments - 80 * macro(COUNTER_NAME, flags, arg) 81 * @param arg a user defined argument to send to the user defined macro 82 * 83 * \details A counter counts the occurrence of some event. Each thread 84 * accumulates its own count, at the end of execution the counts are aggregated 85 * treating each thread as a separate measurement. (Unless onlyInMaster is set, 86 * in which case there's only a single measurement). The min,mean,max are 87 * therefore the values for the threads. Adding the counter here and then 88 * putting a KMP_BLOCK_COUNTER(name) at the point you want to count is all you 89 * need to do. All of the tables and printing is generated from this macro. 90 * Format is "macro(name, flags, arg)" 91 * 92 * @ingroup STATS_GATHERING 93 */ 94 // clang-format off 95 #define KMP_FOREACH_COUNTER(macro, arg) \ 96 macro(OMP_PARALLEL,stats_flags_e::onlyInMaster|stats_flags_e::noTotal,arg) \ 97 macro(OMP_NESTED_PARALLEL, 0, arg) \ 98 macro(OMP_LOOP_STATIC, 0, arg) \ 99 macro(OMP_LOOP_STATIC_STEAL, 0, arg) \ 100 macro(OMP_LOOP_DYNAMIC, 0, arg) \ 101 macro(OMP_DISTRIBUTE, 0, arg) \ 102 macro(OMP_BARRIER, 0, arg) \ 103 macro(OMP_CRITICAL, 0, arg) \ 104 macro(OMP_SINGLE, 0, arg) \ 105 macro(OMP_MASTER, 0, arg) \ 106 macro(OMP_MASKED, 0, arg) \ 107 macro(OMP_TEAMS, 0, arg) \ 108 macro(OMP_set_lock, 0, arg) \ 109 macro(OMP_test_lock, 0, arg) \ 110 macro(REDUCE_wait, 0, arg) \ 111 macro(REDUCE_nowait, 0, arg) \ 112 macro(OMP_TASKYIELD, 0, arg) \ 113 macro(OMP_TASKLOOP, 0, arg) \ 114 macro(TASK_executed, 0, arg) \ 115 macro(TASK_cancelled, 0, arg) \ 116 macro(TASK_stolen, 0, arg) 117 // clang-format on 118 119 /*! 120 * \brief Add new timers under KMP_FOREACH_TIMER() macro in kmp_stats.h 121 * 122 * @param macro a user defined macro that takes three arguments - 123 * macro(TIMER_NAME, flags, arg) 124 * @param arg a user defined argument to send to the user defined macro 125 * 126 * \details A timer collects multiple samples of some count in each thread and 127 * then finally aggregates all of the samples from all of the threads. For most 128 * timers the printing code also provides an aggregation over the thread totals. 129 * These are printed as TOTAL_foo. The count is normally a time (in ticks), 130 * hence the name "timer". (But can be any value, so we use this for "number of 131 * arguments passed to fork" as well). For timers the threads are not 132 * significant, it's the individual observations that count, so the statistics 133 * are at that level. Format is "macro(name, flags, arg)" 134 * 135 * @ingroup STATS_GATHERING2 136 */ 137 // clang-format off 138 #define KMP_FOREACH_TIMER(macro, arg) \ 139 macro (OMP_worker_thread_life, stats_flags_e::logEvent, arg) \ 140 macro (OMP_parallel, stats_flags_e::logEvent, arg) \ 141 macro (OMP_parallel_overhead, stats_flags_e::logEvent, arg) \ 142 macro (OMP_teams, stats_flags_e::logEvent, arg) \ 143 macro (OMP_teams_overhead, stats_flags_e::logEvent, arg) \ 144 macro (OMP_loop_static, 0, arg) \ 145 macro (OMP_loop_static_scheduling, 0, arg) \ 146 macro (OMP_loop_dynamic, 0, arg) \ 147 macro (OMP_loop_dynamic_scheduling, 0, arg) \ 148 macro (OMP_distribute, 0, arg) \ 149 macro (OMP_distribute_scheduling, 0, arg) \ 150 macro (OMP_critical, 0, arg) \ 151 macro (OMP_critical_wait, 0, arg) \ 152 macro (OMP_single, 0, arg) \ 153 macro (OMP_master, 0, arg) \ 154 macro (OMP_masked, 0, arg) \ 155 macro (OMP_task_immediate, 0, arg) \ 156 macro (OMP_task_taskwait, 0, arg) \ 157 macro (OMP_task_taskyield, 0, arg) \ 158 macro (OMP_task_taskgroup, 0, arg) \ 159 macro (OMP_task_join_bar, 0, arg) \ 160 macro (OMP_task_plain_bar, 0, arg) \ 161 macro (OMP_taskloop_scheduling, 0, arg) \ 162 macro (OMP_plain_barrier, stats_flags_e::logEvent, arg) \ 163 macro (OMP_idle, stats_flags_e::logEvent, arg) \ 164 macro (OMP_fork_barrier, stats_flags_e::logEvent, arg) \ 165 macro (OMP_join_barrier, stats_flags_e::logEvent, arg) \ 166 macro (OMP_serial, stats_flags_e::logEvent, arg) \ 167 macro (OMP_set_numthreads, stats_flags_e::noUnits | stats_flags_e::noTotal, \ 168 arg) \ 169 macro (OMP_PARALLEL_args, stats_flags_e::noUnits | stats_flags_e::noTotal, \ 170 arg) \ 171 macro (OMP_loop_static_iterations, \ 172 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ 173 macro (OMP_loop_static_total_iterations, \ 174 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ 175 macro (OMP_loop_dynamic_iterations, \ 176 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ 177 macro (OMP_loop_dynamic_total_iterations, \ 178 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ 179 macro (OMP_distribute_iterations, \ 180 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ 181 KMP_FOREACH_DEVELOPER_TIMER(macro, arg) 182 // clang-format on 183 184 // OMP_worker_thread_life -- Time from thread becoming an OpenMP thread (either 185 // initializing OpenMP or being created by a primary 186 // thread) until the thread is destroyed 187 // OMP_parallel -- Time thread spends executing work directly 188 // within a #pragma omp parallel 189 // OMP_parallel_overhead -- Time thread spends setting up a parallel region 190 // OMP_loop_static -- Time thread spends executing loop iterations from 191 // a statically scheduled loop 192 // OMP_loop_static_scheduling -- Time thread spends scheduling loop iterations 193 // from a statically scheduled loop 194 // OMP_loop_dynamic -- Time thread spends executing loop iterations from 195 // a dynamically scheduled loop 196 // OMP_loop_dynamic_scheduling -- Time thread spends scheduling loop iterations 197 // from a dynamically scheduled loop 198 // OMP_critical -- Time thread spends executing critical section 199 // OMP_critical_wait -- Time thread spends waiting to enter 200 // a critical section 201 // OMP_single -- Time spent executing a "single" region 202 // OMP_master -- Time spent executing a "master" region 203 // OMP_masked -- Time spent executing a "masked" region 204 // OMP_task_immediate -- Time spent executing non-deferred tasks 205 // OMP_task_taskwait -- Time spent executing tasks inside a taskwait 206 // construct 207 // OMP_task_taskyield -- Time spent executing tasks inside a taskyield 208 // construct 209 // OMP_task_taskgroup -- Time spent executing tasks inside a taskygroup 210 // construct 211 // OMP_task_join_bar -- Time spent executing tasks inside a join barrier 212 // OMP_task_plain_bar -- Time spent executing tasks inside a barrier 213 // construct 214 // OMP_taskloop_scheduling -- Time spent scheduling tasks inside a taskloop 215 // construct 216 // OMP_plain_barrier -- Time spent in a #pragma omp barrier construct or 217 // inside implicit barrier at end of worksharing 218 // construct 219 // OMP_idle -- Time worker threads spend waiting for next 220 // parallel region 221 // OMP_fork_barrier -- Time spent in a the fork barrier surrounding a 222 // parallel region 223 // OMP_join_barrier -- Time spent in a the join barrier surrounding a 224 // parallel region 225 // OMP_serial -- Time thread zero spends executing serial code 226 // OMP_set_numthreads -- Values passed to omp_set_num_threads 227 // OMP_PARALLEL_args -- Number of arguments passed to a parallel region 228 // OMP_loop_static_iterations -- Number of iterations thread is assigned for 229 // statically scheduled loops 230 // OMP_loop_dynamic_iterations -- Number of iterations thread is assigned for 231 // dynamically scheduled loops 232 233 #if (KMP_DEVELOPER_STATS) 234 // Timers which are of interest to runtime library developers, not end users. 235 // These have to be explicitly enabled in addition to the other stats. 236 237 // KMP_fork_barrier -- time in __kmp_fork_barrier 238 // KMP_join_barrier -- time in __kmp_join_barrier 239 // KMP_barrier -- time in __kmp_barrier 240 // KMP_end_split_barrier -- time in __kmp_end_split_barrier 241 // KMP_setup_icv_copy -- time in __kmp_setup_icv_copy 242 // KMP_icv_copy -- start/stop timer for any ICV copying 243 // KMP_linear_gather -- time in __kmp_linear_barrier_gather 244 // KMP_linear_release -- time in __kmp_linear_barrier_release 245 // KMP_tree_gather -- time in __kmp_tree_barrier_gather 246 // KMP_tree_release -- time in __kmp_tree_barrier_release 247 // KMP_hyper_gather -- time in __kmp_hyper_barrier_gather 248 // KMP_hyper_release -- time in __kmp_hyper_barrier_release 249 // KMP_dist_gather -- time in __kmp_dist_barrier_gather 250 // KMP_dist_release -- time in __kmp_dist_barrier_release 251 // clang-format off 252 #define KMP_FOREACH_DEVELOPER_TIMER(macro, arg) \ 253 macro(KMP_fork_call, 0, arg) \ 254 macro(KMP_join_call, 0, arg) \ 255 macro(KMP_end_split_barrier, 0, arg) \ 256 macro(KMP_hier_gather, 0, arg) \ 257 macro(KMP_hier_release, 0, arg) \ 258 macro(KMP_hyper_gather, 0, arg) \ 259 macro(KMP_hyper_release, 0, arg) \ 260 macro(KMP_dist_gather, 0, arg) \ 261 macro(KMP_dist_release, 0, arg) \ 262 macro(KMP_linear_gather, 0, arg) \ 263 macro(KMP_linear_release, 0, arg) \ 264 macro(KMP_tree_gather, 0, arg) \ 265 macro(KMP_tree_release, 0, arg) \ 266 macro(USER_resume, 0, arg) \ 267 macro(USER_suspend, 0, arg) \ 268 macro(USER_mwait, 0, arg) \ 269 macro(KMP_allocate_team, 0, arg) \ 270 macro(KMP_setup_icv_copy, 0, arg) \ 271 macro(USER_icv_copy, 0, arg) \ 272 macro (FOR_static_steal_stolen, \ 273 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ 274 macro (FOR_static_steal_chunks, \ 275 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) 276 #else 277 #define KMP_FOREACH_DEVELOPER_TIMER(macro, arg) 278 #endif 279 // clang-format on 280 281 /*! 282 * \brief Add new explicit timers under KMP_FOREACH_EXPLICIT_TIMER() macro. 283 * 284 * @param macro a user defined macro that takes three arguments - 285 * macro(TIMER_NAME, flags, arg) 286 * @param arg a user defined argument to send to the user defined macro 287 * 288 * \warning YOU MUST HAVE THE SAME NAMED TIMER UNDER KMP_FOREACH_TIMER() OR ELSE 289 * BAD THINGS WILL HAPPEN! 290 * 291 * \details Explicit timers are ones where we need to allocate a timer itself 292 * (as well as the accumulated timing statistics). We allocate these on a 293 * per-thread basis, and explicitly start and stop them. Block timers just 294 * allocate the timer itself on the stack, and use the destructor to notice 295 * block exit; they don't need to be defined here. The name here should be the 296 * same as that of a timer above. 297 * 298 * @ingroup STATS_GATHERING 299 */ 300 #define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) KMP_FOREACH_TIMER(macro, arg) 301 302 #define ENUMERATE(name, ignore, prefix) prefix##name, 303 enum timer_e { KMP_FOREACH_TIMER(ENUMERATE, TIMER_) TIMER_LAST }; 304 305 enum explicit_timer_e { 306 KMP_FOREACH_EXPLICIT_TIMER(ENUMERATE, EXPLICIT_TIMER_) EXPLICIT_TIMER_LAST 307 }; 308 309 enum counter_e { KMP_FOREACH_COUNTER(ENUMERATE, COUNTER_) COUNTER_LAST }; 310 #undef ENUMERATE 311 312 /* 313 * A logarithmic histogram. It accumulates the number of values in each power of 314 * ten bin. So 1<=x<10, 10<=x<100, ... 315 * Mostly useful where we have some big outliers and want to see information 316 * about them. 317 */ 318 class logHistogram { 319 enum { 320 numBins = 31, /* Number of powers of 10. If this changes you need to change 321 * the initializer for binMax */ 322 323 /* 324 * If you want to use this to analyse values that may be less than 1, (for 325 * instance times in s), then the logOffset gives you negative powers. 326 * In our case here, we're just looking at times in ticks, or counts, so we 327 * can never see values with magnitude < 1 (other than zero), so we can set 328 * it to 0. As above change the initializer if you change this. 329 */ 330 logOffset = 0 331 }; 332 uint32_t KMP_ALIGN_CACHE zeroCount; 333 struct { 334 uint32_t count; 335 double total; 336 } bins[numBins]; 337 338 static double binMax[numBins]; 339 340 #ifdef KMP_DEBUG 341 uint64_t _total; 342 343 void check() const { 344 uint64_t t = zeroCount; 345 for (int i = 0; i < numBins; i++) 346 t += bins[i].count; 347 KMP_DEBUG_ASSERT(t == _total); 348 } 349 #else 350 void check() const {} 351 #endif 352 353 public: 354 logHistogram() { reset(); } 355 356 logHistogram(logHistogram const &o) { 357 for (int i = 0; i < numBins; i++) 358 bins[i] = o.bins[i]; 359 #ifdef KMP_DEBUG 360 _total = o._total; 361 #endif 362 } 363 364 void reset() { 365 zeroCount = 0; 366 for (int i = 0; i < numBins; i++) { 367 bins[i].count = 0; 368 bins[i].total = 0; 369 } 370 371 #ifdef KMP_DEBUG 372 _total = 0; 373 #endif 374 } 375 uint32_t count(int b) const { return bins[b + logOffset].count; } 376 double total(int b) const { return bins[b + logOffset].total; } 377 static uint32_t findBin(double sample); 378 379 logHistogram &operator+=(logHistogram const &o) { 380 zeroCount += o.zeroCount; 381 for (int i = 0; i < numBins; i++) { 382 bins[i].count += o.bins[i].count; 383 bins[i].total += o.bins[i].total; 384 } 385 #ifdef KMP_DEBUG 386 _total += o._total; 387 check(); 388 #endif 389 390 return *this; 391 } 392 393 void addSample(double sample); 394 int minBin() const; 395 int maxBin() const; 396 397 std::string format(char) const; 398 }; 399 400 class statistic { 401 double KMP_ALIGN_CACHE minVal; 402 double maxVal; 403 double meanVal; 404 double m2; 405 uint64_t sampleCount; 406 double offset; 407 bool collectingHist; 408 logHistogram hist; 409 410 public: 411 statistic(bool doHist = bool(KMP_STATS_HIST)) { 412 reset(); 413 collectingHist = doHist; 414 } 415 statistic(statistic const &o) 416 : minVal(o.minVal), maxVal(o.maxVal), meanVal(o.meanVal), m2(o.m2), 417 sampleCount(o.sampleCount), offset(o.offset), 418 collectingHist(o.collectingHist), hist(o.hist) {} 419 statistic(double minv, double maxv, double meanv, uint64_t sc, double sd) 420 : minVal(minv), maxVal(maxv), meanVal(meanv), m2(sd * sd * sc), 421 sampleCount(sc), offset(0.0), collectingHist(false) {} 422 bool haveHist() const { return collectingHist; } 423 double getMin() const { return minVal; } 424 double getMean() const { return meanVal; } 425 double getMax() const { return maxVal; } 426 uint64_t getCount() const { return sampleCount; } 427 double getSD() const { return sqrt(m2 / sampleCount); } 428 double getTotal() const { return sampleCount * meanVal; } 429 logHistogram const *getHist() const { return &hist; } 430 void setOffset(double d) { offset = d; } 431 432 void reset() { 433 minVal = (std::numeric_limits<double>::max)(); 434 maxVal = -minVal; 435 meanVal = 0.0; 436 m2 = 0.0; 437 sampleCount = 0; 438 offset = 0.0; 439 hist.reset(); 440 } 441 void addSample(double sample); 442 void scale(double factor); 443 void scaleDown(double f) { scale(1. / f); } 444 void forceCount(uint64_t count) { sampleCount = count; } 445 statistic &operator+=(statistic const &other); 446 447 std::string format(char unit, bool total = false) const; 448 std::string formatHist(char unit) const { return hist.format(unit); } 449 }; 450 451 struct statInfo { 452 const char *name; 453 uint32_t flags; 454 }; 455 456 class timeStat : public statistic { 457 static statInfo timerInfo[]; 458 459 public: 460 timeStat() : statistic() {} 461 static const char *name(timer_e e) { return timerInfo[e].name; } 462 static bool noTotal(timer_e e) { 463 return timerInfo[e].flags & stats_flags_e::noTotal; 464 } 465 static bool masterOnly(timer_e e) { 466 return timerInfo[e].flags & stats_flags_e::onlyInMaster; 467 } 468 static bool workerOnly(timer_e e) { 469 return timerInfo[e].flags & stats_flags_e::notInMaster; 470 } 471 static bool noUnits(timer_e e) { 472 return timerInfo[e].flags & stats_flags_e::noUnits; 473 } 474 static bool logEvent(timer_e e) { 475 return timerInfo[e].flags & stats_flags_e::logEvent; 476 } 477 static void clearEventFlags() { 478 for (int i = 0; i < TIMER_LAST; i++) { 479 timerInfo[i].flags &= (~(stats_flags_e::logEvent)); 480 } 481 } 482 }; 483 484 // Where we need explicitly to start and end the timer, this version can be used 485 // Since these timers normally aren't nicely scoped, so don't have a good place 486 // to live on the stack of the thread, they're more work to use. 487 class explicitTimer { 488 timeStat *stat; 489 timer_e timerEnumValue; 490 tsc_tick_count startTime; 491 tsc_tick_count pauseStartTime; 492 tsc_tick_count::tsc_interval_t totalPauseTime; 493 494 public: 495 explicitTimer(timeStat *s, timer_e te) 496 : stat(s), timerEnumValue(te), startTime(), pauseStartTime(0), 497 totalPauseTime() {} 498 499 // void setStat(timeStat *s) { stat = s; } 500 void start(tsc_tick_count tick); 501 void pause(tsc_tick_count tick) { pauseStartTime = tick; } 502 void resume(tsc_tick_count tick) { 503 totalPauseTime += (tick - pauseStartTime); 504 } 505 void stop(tsc_tick_count tick, kmp_stats_list *stats_ptr = nullptr); 506 void reset() { 507 startTime = 0; 508 pauseStartTime = 0; 509 totalPauseTime = 0; 510 } 511 timer_e get_type() const { return timerEnumValue; } 512 }; 513 514 // Where you need to partition a threads clock ticks into separate states 515 // e.g., a partitionedTimers class with two timers of EXECUTING_TASK, and 516 // DOING_NOTHING would render these conditions: 517 // time(EXECUTING_TASK) + time(DOING_NOTHING) = total time thread is alive 518 // No clock tick in the EXECUTING_TASK is a member of DOING_NOTHING and vice 519 // versa 520 class partitionedTimers { 521 private: 522 std::vector<explicitTimer> timer_stack; 523 524 public: 525 partitionedTimers(); 526 void init(explicitTimer timer); 527 void exchange(explicitTimer timer); 528 void push(explicitTimer timer); 529 void pop(); 530 void windup(); 531 }; 532 533 // Special wrapper around the partitioned timers to aid timing code blocks 534 // It avoids the need to have an explicit end, leaving the scope suffices. 535 class blockPartitionedTimer { 536 partitionedTimers *part_timers; 537 538 public: 539 blockPartitionedTimer(partitionedTimers *pt, explicitTimer timer) 540 : part_timers(pt) { 541 part_timers->push(timer); 542 } 543 ~blockPartitionedTimer() { part_timers->pop(); } 544 }; 545 546 // Special wrapper around the thread state to aid in keeping state in code 547 // blocks It avoids the need to have an explicit end, leaving the scope 548 // suffices. 549 class blockThreadState { 550 stats_state_e *state_pointer; 551 stats_state_e old_state; 552 553 public: 554 blockThreadState(stats_state_e *thread_state_pointer, stats_state_e new_state) 555 : state_pointer(thread_state_pointer), old_state(*thread_state_pointer) { 556 *state_pointer = new_state; 557 } 558 ~blockThreadState() { *state_pointer = old_state; } 559 }; 560 561 // If all you want is a count, then you can use this... 562 // The individual per-thread counts will be aggregated into a statistic at 563 // program exit. 564 class counter { 565 uint64_t value; 566 static const statInfo counterInfo[]; 567 568 public: 569 counter() : value(0) {} 570 void increment() { value++; } 571 uint64_t getValue() const { return value; } 572 void reset() { value = 0; } 573 static const char *name(counter_e e) { return counterInfo[e].name; } 574 static bool masterOnly(counter_e e) { 575 return counterInfo[e].flags & stats_flags_e::onlyInMaster; 576 } 577 }; 578 579 /* **************************************************************** 580 Class to implement an event 581 582 There are four components to an event: start time, stop time 583 nest_level, and timer_name. 584 The start and stop time should be obvious (recorded in clock ticks). 585 The nest_level relates to the bar width in the timeline graph. 586 The timer_name is used to determine which timer event triggered this event. 587 588 the interface to this class is through four read-only operations: 589 1) getStart() -- returns the start time as 64 bit integer 590 2) getStop() -- returns the stop time as 64 bit integer 591 3) getNestLevel() -- returns the nest level of the event 592 4) getTimerName() -- returns the timer name that triggered event 593 594 *MORE ON NEST_LEVEL* 595 The nest level is used in the bar graph that represents the timeline. 596 Its main purpose is for showing how events are nested inside eachother. 597 For example, say events, A, B, and C are recorded. If the timeline 598 looks like this: 599 600 Begin -------------------------------------------------------------> Time 601 | | | | | | 602 A B C C B A 603 start start start end end end 604 605 Then A, B, C will have a nest level of 1, 2, 3 respectively. 606 These values are then used to calculate the barwidth so you can 607 see that inside A, B has occurred, and inside B, C has occurred. 608 Currently, this is shown with A's bar width being larger than B's 609 bar width, and B's bar width being larger than C's bar width. 610 611 **************************************************************** */ 612 class kmp_stats_event { 613 uint64_t start; 614 uint64_t stop; 615 int nest_level; 616 timer_e timer_name; 617 618 public: 619 kmp_stats_event() 620 : start(0), stop(0), nest_level(0), timer_name(TIMER_LAST) {} 621 kmp_stats_event(uint64_t strt, uint64_t stp, int nst, timer_e nme) 622 : start(strt), stop(stp), nest_level(nst), timer_name(nme) {} 623 inline uint64_t getStart() const { return start; } 624 inline uint64_t getStop() const { return stop; } 625 inline int getNestLevel() const { return nest_level; } 626 inline timer_e getTimerName() const { return timer_name; } 627 }; 628 629 /* **************************************************************** 630 Class to implement a dynamically expandable array of events 631 632 --------------------------------------------------------- 633 | event 1 | event 2 | event 3 | event 4 | ... | event N | 634 --------------------------------------------------------- 635 636 An event is pushed onto the back of this array at every 637 explicitTimer->stop() call. The event records the thread #, 638 start time, stop time, and nest level related to the bar width. 639 640 The event vector starts at size INIT_SIZE and grows (doubles in size) 641 if needed. An implication of this behavior is that log(N) 642 reallocations are needed (where N is number of events). If you want 643 to avoid reallocations, then set INIT_SIZE to a large value. 644 645 the interface to this class is through six operations: 646 1) reset() -- sets the internal_size back to 0 but does not deallocate any 647 memory 648 2) size() -- returns the number of valid elements in the vector 649 3) push_back(start, stop, nest, timer_name) -- pushes an event onto 650 the back of the array 651 4) deallocate() -- frees all memory associated with the vector 652 5) sort() -- sorts the vector by start time 653 6) operator[index] or at(index) -- returns event reference at that index 654 **************************************************************** */ 655 class kmp_stats_event_vector { 656 kmp_stats_event *events; 657 int internal_size; 658 int allocated_size; 659 static const int INIT_SIZE = 1024; 660 661 public: 662 kmp_stats_event_vector() { 663 events = 664 (kmp_stats_event *)__kmp_allocate(sizeof(kmp_stats_event) * INIT_SIZE); 665 internal_size = 0; 666 allocated_size = INIT_SIZE; 667 } 668 ~kmp_stats_event_vector() {} 669 inline void reset() { internal_size = 0; } 670 inline int size() const { return internal_size; } 671 void push_back(uint64_t start_time, uint64_t stop_time, int nest_level, 672 timer_e name) { 673 int i; 674 if (internal_size == allocated_size) { 675 kmp_stats_event *tmp = (kmp_stats_event *)__kmp_allocate( 676 sizeof(kmp_stats_event) * allocated_size * 2); 677 for (i = 0; i < internal_size; i++) 678 tmp[i] = events[i]; 679 __kmp_free(events); 680 events = tmp; 681 allocated_size *= 2; 682 } 683 events[internal_size] = 684 kmp_stats_event(start_time, stop_time, nest_level, name); 685 internal_size++; 686 return; 687 } 688 void deallocate(); 689 void sort(); 690 const kmp_stats_event &operator[](int index) const { return events[index]; } 691 kmp_stats_event &operator[](int index) { return events[index]; } 692 const kmp_stats_event &at(int index) const { return events[index]; } 693 kmp_stats_event &at(int index) { return events[index]; } 694 }; 695 696 /* **************************************************************** 697 Class to implement a doubly-linked, circular, statistics list 698 699 |---| ---> |---| ---> |---| ---> |---| ---> ... next 700 | | | | | | | | 701 |---| <--- |---| <--- |---| <--- |---| <--- ... prev 702 Sentinel first second third 703 Node node node node 704 705 The Sentinel Node is the user handle on the list. 706 The first node corresponds to thread 0's statistics. 707 The second node corresponds to thread 1's statistics and so on... 708 709 Each node has a _timers, _counters, and _explicitTimers array to hold that 710 thread's statistics. The _explicitTimers point to the correct _timer and 711 update its statistics at every stop() call. The explicitTimers' pointers are 712 set up in the constructor. Each node also has an event vector to hold that 713 thread's timing events. The event vector expands as necessary and records 714 the start-stop times for each timer. 715 716 The nestLevel variable is for plotting events and is related 717 to the bar width in the timeline graph. 718 719 Every thread will have a thread local pointer to its node in 720 the list. The sentinel node is used by the primary thread to 721 store "dummy" statistics before __kmp_create_worker() is called. 722 **************************************************************** */ 723 class kmp_stats_list { 724 int gtid; 725 timeStat _timers[TIMER_LAST + 1]; 726 counter _counters[COUNTER_LAST + 1]; 727 explicitTimer thread_life_timer; 728 partitionedTimers _partitionedTimers; 729 int _nestLevel; // one per thread 730 kmp_stats_event_vector _event_vector; 731 kmp_stats_list *next; 732 kmp_stats_list *prev; 733 stats_state_e state; 734 int thread_is_idle_flag; 735 736 public: 737 kmp_stats_list() 738 : thread_life_timer(&_timers[TIMER_OMP_worker_thread_life], 739 TIMER_OMP_worker_thread_life), 740 _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE), 741 thread_is_idle_flag(0) {} 742 ~kmp_stats_list() {} 743 inline timeStat *getTimer(timer_e idx) { return &_timers[idx]; } 744 inline counter *getCounter(counter_e idx) { return &_counters[idx]; } 745 inline partitionedTimers *getPartitionedTimers() { 746 return &_partitionedTimers; 747 } 748 inline timeStat *getTimers() { return _timers; } 749 inline counter *getCounters() { return _counters; } 750 inline kmp_stats_event_vector &getEventVector() { return _event_vector; } 751 inline void startLife() { thread_life_timer.start(tsc_tick_count::now()); } 752 inline void endLife() { thread_life_timer.stop(tsc_tick_count::now(), this); } 753 inline void resetEventVector() { _event_vector.reset(); } 754 inline void incrementNestValue() { _nestLevel++; } 755 inline int getNestValue() { return _nestLevel; } 756 inline void decrementNestValue() { _nestLevel--; } 757 inline int getGtid() const { return gtid; } 758 inline void setGtid(int newgtid) { gtid = newgtid; } 759 inline void setState(stats_state_e newstate) { state = newstate; } 760 inline stats_state_e getState() const { return state; } 761 inline stats_state_e *getStatePointer() { return &state; } 762 inline bool isIdle() { return thread_is_idle_flag == 1; } 763 inline void setIdleFlag() { thread_is_idle_flag = 1; } 764 inline void resetIdleFlag() { thread_is_idle_flag = 0; } 765 kmp_stats_list *push_back(int gtid); // returns newly created list node 766 inline void push_event(uint64_t start_time, uint64_t stop_time, 767 int nest_level, timer_e name) { 768 _event_vector.push_back(start_time, stop_time, nest_level, name); 769 } 770 void deallocate(); 771 class iterator; 772 kmp_stats_list::iterator begin(); 773 kmp_stats_list::iterator end(); 774 int size(); 775 class iterator { 776 kmp_stats_list *ptr; 777 friend kmp_stats_list::iterator kmp_stats_list::begin(); 778 friend kmp_stats_list::iterator kmp_stats_list::end(); 779 780 public: 781 iterator(); 782 ~iterator(); 783 iterator operator++(); 784 iterator operator++(int dummy); 785 iterator operator--(); 786 iterator operator--(int dummy); 787 bool operator!=(const iterator &rhs); 788 bool operator==(const iterator &rhs); 789 kmp_stats_list *operator*() const; // dereference operator 790 }; 791 }; 792 793 /* **************************************************************** 794 Class to encapsulate all output functions and the environment variables 795 796 This module holds filenames for various outputs (normal stats, events, plot 797 file), as well as coloring information for the plot file. 798 799 The filenames and flags variables are read from environment variables. 800 These are read once by the constructor of the global variable 801 __kmp_stats_output which calls init(). 802 803 During this init() call, event flags for the timeStat::timerInfo[] global 804 array are cleared if KMP_STATS_EVENTS is not true (on, 1, yes). 805 806 The only interface function that is public is outputStats(heading). This 807 function should print out everything it needs to, either to files or stderr, 808 depending on the environment variables described below 809 810 ENVIRONMENT VARIABLES: 811 KMP_STATS_FILE -- if set, all statistics (not events) will be printed to this 812 file, otherwise, print to stderr 813 KMP_STATS_THREADS -- if set to "on", then will print per thread statistics to 814 either KMP_STATS_FILE or stderr 815 KMP_STATS_PLOT_FILE -- if set, print the ploticus plot file to this filename, 816 otherwise, the plot file is sent to "events.plt" 817 KMP_STATS_EVENTS -- if set to "on", then log events, otherwise, don't log 818 events 819 KMP_STATS_EVENTS_FILE -- if set, all events are outputted to this file, 820 otherwise, output is sent to "events.dat" 821 **************************************************************** */ 822 class kmp_stats_output_module { 823 824 public: 825 struct rgb_color { 826 float r; 827 float g; 828 float b; 829 }; 830 831 private: 832 std::string outputFileName; 833 static const char *eventsFileName; 834 static const char *plotFileName; 835 static int printPerThreadFlag; 836 static int printPerThreadEventsFlag; 837 static const rgb_color globalColorArray[]; 838 static rgb_color timerColorInfo[]; 839 840 void init(); 841 static void setupEventColors(); 842 static void printPloticusFile(); 843 static void printHeaderInfo(FILE *statsOut); 844 static void printTimerStats(FILE *statsOut, statistic const *theStats, 845 statistic const *totalStats); 846 static void printCounterStats(FILE *statsOut, statistic const *theStats); 847 static void printCounters(FILE *statsOut, counter const *theCounters); 848 static void printEvents(FILE *eventsOut, kmp_stats_event_vector *theEvents, 849 int gtid); 850 static rgb_color getEventColor(timer_e e) { return timerColorInfo[e]; } 851 static void windupExplicitTimers(); 852 bool eventPrintingEnabled() const { return printPerThreadEventsFlag; } 853 854 public: 855 kmp_stats_output_module() { init(); } 856 void outputStats(const char *heading); 857 }; 858 859 #ifdef __cplusplus 860 extern "C" { 861 #endif 862 void __kmp_stats_init(); 863 void __kmp_stats_fini(); 864 void __kmp_reset_stats(); 865 void __kmp_output_stats(const char *); 866 void __kmp_accumulate_stats_at_exit(void); 867 // thread local pointer to stats node within list 868 extern KMP_THREAD_LOCAL kmp_stats_list *__kmp_stats_thread_ptr; 869 // head to stats list. 870 extern kmp_stats_list *__kmp_stats_list; 871 // lock for __kmp_stats_list 872 extern kmp_tas_lock_t __kmp_stats_lock; 873 // reference start time 874 extern tsc_tick_count __kmp_stats_start_time; 875 // interface to output 876 extern kmp_stats_output_module __kmp_stats_output; 877 878 #ifdef __cplusplus 879 } 880 #endif 881 882 // Simple, standard interfaces that drop out completely if stats aren't enabled 883 884 /*! 885 * \brief Adds value to specified timer (name). 886 * 887 * @param name timer name as specified under the KMP_FOREACH_TIMER() macro 888 * @param value double precision sample value to add to statistics for the timer 889 * 890 * \details Use KMP_COUNT_VALUE(name, value) macro to add a particular value to 891 * a timer statistics. 892 * 893 * @ingroup STATS_GATHERING 894 */ 895 #define KMP_COUNT_VALUE(name, value) \ 896 __kmp_stats_thread_ptr->getTimer(TIMER_##name)->addSample((double)value) 897 898 /*! 899 * \brief Increments specified counter (name). 900 * 901 * @param name counter name as specified under the KMP_FOREACH_COUNTER() macro 902 * 903 * \details Use KMP_COUNT_BLOCK(name, value) macro to increment a statistics 904 * counter for the executing thread. 905 * 906 * @ingroup STATS_GATHERING 907 */ 908 #define KMP_COUNT_BLOCK(name) \ 909 __kmp_stats_thread_ptr->getCounter(COUNTER_##name)->increment() 910 911 /*! 912 * \brief Outputs the current thread statistics and reset them. 913 * 914 * @param heading_string heading put above the final stats output 915 * 916 * \details Explicitly stops all timers and outputs all stats. Environment 917 * variable, `OMPTB_STATSFILE=filename`, can be used to output the stats to a 918 * filename instead of stderr. Environment variable, 919 * `OMPTB_STATSTHREADS=true|undefined`, can be used to output thread specific 920 * stats. For now the `OMPTB_STATSTHREADS` environment variable can either be 921 * defined with any value, which will print out thread specific stats, or it can 922 * be undefined (not specified in the environment) and thread specific stats 923 * won't be printed. It should be noted that all statistics are reset when this 924 * macro is called. 925 * 926 * @ingroup STATS_GATHERING 927 */ 928 #define KMP_OUTPUT_STATS(heading_string) __kmp_output_stats(heading_string) 929 930 /*! 931 * \brief Initializes the partitioned timers to begin with name. 932 * 933 * @param name timer which you want this thread to begin with 934 * 935 * @ingroup STATS_GATHERING 936 */ 937 #define KMP_INIT_PARTITIONED_TIMERS(name) \ 938 __kmp_stats_thread_ptr->getPartitionedTimers()->init(explicitTimer( \ 939 __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name)) 940 941 #define KMP_TIME_PARTITIONED_BLOCK(name) \ 942 blockPartitionedTimer __PBLOCKTIME__( \ 943 __kmp_stats_thread_ptr->getPartitionedTimers(), \ 944 explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name), \ 945 TIMER_##name)) 946 947 #define KMP_PUSH_PARTITIONED_TIMER(name) \ 948 __kmp_stats_thread_ptr->getPartitionedTimers()->push(explicitTimer( \ 949 __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name)) 950 951 #define KMP_POP_PARTITIONED_TIMER() \ 952 __kmp_stats_thread_ptr->getPartitionedTimers()->pop() 953 954 #define KMP_EXCHANGE_PARTITIONED_TIMER(name) \ 955 __kmp_stats_thread_ptr->getPartitionedTimers()->exchange(explicitTimer( \ 956 __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name)) 957 958 #define KMP_SET_THREAD_STATE(state_name) \ 959 __kmp_stats_thread_ptr->setState(state_name) 960 961 #define KMP_GET_THREAD_STATE() __kmp_stats_thread_ptr->getState() 962 963 #define KMP_SET_THREAD_STATE_BLOCK(state_name) \ 964 blockThreadState __BTHREADSTATE__(__kmp_stats_thread_ptr->getStatePointer(), \ 965 state_name) 966 967 /*! 968 * \brief resets all stats (counters to 0, timers to 0 elapsed ticks) 969 * 970 * \details Reset all stats for all threads. 971 * 972 * @ingroup STATS_GATHERING 973 */ 974 #define KMP_RESET_STATS() __kmp_reset_stats() 975 976 #if (KMP_DEVELOPER_STATS) 977 #define KMP_COUNT_DEVELOPER_VALUE(n, v) KMP_COUNT_VALUE(n, v) 978 #define KMP_COUNT_DEVELOPER_BLOCK(n) KMP_COUNT_BLOCK(n) 979 #define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) KMP_TIME_PARTITIONED_BLOCK(n) 980 #define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) KMP_PUSH_PARTITIONED_TIMER(n) 981 #define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) KMP_POP_PARTITIONED_TIMER(n) 982 #define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) \ 983 KMP_EXCHANGE_PARTITIONED_TIMER(n) 984 #else 985 // Null definitions 986 #define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0) 987 #define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0) 988 #define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0) 989 #define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0) 990 #define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0) 991 #define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0) 992 #endif 993 994 #else // KMP_STATS_ENABLED 995 996 // Null definitions 997 #define KMP_COUNT_VALUE(n, v) ((void)0) 998 #define KMP_COUNT_BLOCK(n) ((void)0) 999 1000 #define KMP_OUTPUT_STATS(heading_string) ((void)0) 1001 #define KMP_RESET_STATS() ((void)0) 1002 1003 #define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0) 1004 #define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0) 1005 #define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0) 1006 #define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0) 1007 #define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0) 1008 #define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0) 1009 #define KMP_INIT_PARTITIONED_TIMERS(name) ((void)0) 1010 #define KMP_TIME_PARTITIONED_BLOCK(name) ((void)0) 1011 #define KMP_PUSH_PARTITIONED_TIMER(name) ((void)0) 1012 #define KMP_POP_PARTITIONED_TIMER() ((void)0) 1013 #define KMP_SET_THREAD_STATE(state_name) ((void)0) 1014 #define KMP_GET_THREAD_STATE() ((void)0) 1015 #define KMP_SET_THREAD_STATE_BLOCK(state_name) ((void)0) 1016 #endif // KMP_STATS_ENABLED 1017 1018 #endif // KMP_STATS_H 1019