/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2008-2009, Intel Corporation.
 * All Rights Reserved.
 */

/* Maximum depth of the lt_prio/lt_cause tag stack kept per thread. */
#define MAX_TAG 8
/* Maximum number of stack frames recorded by stack(). */
#define MAX_STACK 64

#pragma D option aggsize=8m
#pragma D option bufsize=16m
#pragma D option dynvarsize=16m
#pragma D option aggrate=0
#pragma D option stackframes=MAX_STACK
/*
 * Our D script needs to compile even if some of the TRANSLATE probes cannot
 * be found. Missing probes can be caused by older kernel, different
 * architecture, unloaded modules etc.
 */
#pragma D option zdefs

/*
 * When sched tracing is enabled we trace every thread; otherwise pid 0
 * (the kernel/sched process) is filtered out of every predicate.
 */
#if defined(ENABLE_SCHED)
#define TRACE_FILTER
#define TRACE_FILTER_COND(a) / (a) /
#else
#define TRACE_FILTER / pid != 0 /
#define TRACE_FILTER_COND(a) / pid != 0 && (a) /
#endif

/* Threshold (ns) to filter WAKEABLE latencies. */
#define FILTER_THRESHOLD 5000000
/* From thread.h */
#define T_WAKEABLE 2

/*
 * This array is used to store timestamp of when threads are enqueued
 * to dispatch queue.
 * self-> is not accessible when enqueue happens.
 */
unsigned long long lt_timestamps[int, int];

self unsigned int lt_is_block_wakeable;		/* T_WAKEABLE bit at off-cpu */
self unsigned long long lt_sleep_start;		/* timestamp at off-cpu */
self unsigned long long lt_sleep_duration;	/* off-cpu -> enqueue interval */
self unsigned long long lt_sch_delay;		/* enqueue -> on-cpu interval */
self unsigned int lt_counter;		/* only used in low overhead */
self unsigned long long lt_timestamp;	/* only used in low overhead */
self unsigned int lt_stackp;		/* depth of tag stack (see TRANSLATE) */
self unsigned int lt_prio[int];		/* tag stack: priority per level */
self string lt_cause[int];		/* tag stack: cause string per level */

this unsigned int priority;
this string cause;

/*
 * Clean up everything, otherwise we will run out of memory.
 *
 * NOTE(review): lt_sch_delay, lt_sleep_duration and lt_stackp are not
 * reset here — presumably they are always cleared by the on-cpu clauses
 * before the thread can exit; confirm against the consuming clauses below.
 */
proc:::lwp-exit
{
	lt_timestamps[curpsinfo->pr_pid, curlwpsinfo->pr_lwpid] = 0;

	self->lt_sleep_start = 0;
	self->lt_is_block_wakeable = 0;
	self->lt_counter = 0;
	self->lt_timestamp = 0;

	/*
	 * Workaround: no way to clear associative array.
	 * We have to manually clear 0 ~ (MAX_TAG-1).
	 */

	self->lt_prio[0] = 0;
	self->lt_prio[1] = 0;
	self->lt_prio[2] = 0;
	self->lt_prio[3] = 0;
	self->lt_prio[4] = 0;
	self->lt_prio[5] = 0;
	self->lt_prio[6] = 0;
	self->lt_prio[7] = 0;

	self->lt_cause[0] = 0;
	self->lt_cause[1] = 0;
	self->lt_cause[2] = 0;
	self->lt_cause[3] = 0;
	self->lt_cause[4] = 0;
	self->lt_cause[5] = 0;
	self->lt_cause[6] = 0;
	self->lt_cause[7] = 0;
}

#if !defined(ENABLE_LOW_OVERHEAD)
/*
 * Log timestamp when a thread is taken off the CPU.
 * Only threads that go to sleep (SSLEEP) are tracked; the global
 * lt_timestamps[] copy is needed because self-> variables are not
 * reachable from the sched:::enqueue probe below.
 */
sched::resume:off-cpu
TRACE_FILTER_COND(curlwpsinfo->pr_state == SSLEEP)
{
	self->lt_sleep_start = timestamp;
	self->lt_is_block_wakeable = curthread->t_flag & T_WAKEABLE;

	lt_timestamps[curpsinfo->pr_pid, curlwpsinfo->pr_lwpid] =
	    self->lt_sleep_start;
}

/*
 * Log timestamp when a thread is put on a dispatch queue and becomes runnable.
 */
sched:::enqueue
/lt_timestamps[args[1]->pr_pid, args[0]->pr_lwpid] != 0/
{
	lt_timestamps[args[1]->pr_pid, args[0]->pr_lwpid] = timestamp;
}

/*
 * Calculate latency when the thread is actually on the CPU.
 * This is necessary in order to get the right stack.
 *
 * Note: the several sched::resume:on-cpu clauses below fire in program
 * order for the same probe; each later clause consumes and clears the
 * self-> variables produced here.
 */
this unsigned long long end;
this unsigned long long now;
sched::resume:on-cpu
/self->lt_sleep_start != 0/
{
	this->end = lt_timestamps[curpsinfo->pr_pid, curlwpsinfo->pr_lwpid];
	this->now = timestamp;
	lt_timestamps[curpsinfo->pr_pid, curlwpsinfo->pr_lwpid] = 0;
	/*
	 * If no enqueue was seen (stale or missing timestamp), fall back to
	 * "now" so the whole interval counts as sleep and none as queue delay.
	 */
	this->end = (this->end != 0 && this->end != self->lt_sleep_start)
	    ? this->end : this->now;

	self->lt_sch_delay = this->now - this->end;
	self->lt_sleep_duration = this->end - self->lt_sleep_start;
	self->lt_sleep_start = 0;
}

/*
 * Filter: drop all "large" latency when it is interruptible, i.e., sleep()
 * etc.
 */
#if defined(ENABLE_FILTER)
sched::resume:on-cpu
/self->lt_sleep_duration > FILTER_THRESHOLD &&
    self->lt_is_block_wakeable != 0/
{
	self->lt_sch_delay = 0;
	self->lt_sleep_duration = 0;
	self->lt_is_block_wakeable = 0;
}
#endif /* defined(ENABLE_FILTER) */

/*
 * Write sleep time to the aggregation.
 * lt_sleep_duration is the duration between the time when a thread is taken
 * off the CPU and the time when it is enqueued again.
 * The cause/priority tag comes from the top of the TRANSLATE tag stack,
 * or is empty when no tag is pushed.
 */
sched::resume:on-cpu
/self->lt_sleep_duration != 0/
{
	this->cause = self->lt_stackp > 0 ?
	    self->lt_cause[self->lt_stackp - 1] : "";
	this->priority = self->lt_stackp > 0 ?
	    self->lt_prio[self->lt_stackp - 1] : 0;

	@lt_call_count[pid, tid, stack(), this->cause,
	    this->priority] = count();
	@lt_call_sum[pid, tid, stack(), this->cause,
	    this->priority] = sum(self->lt_sleep_duration);
	@lt_call_max[pid, tid, stack(), this->cause,
	    this->priority] = max(self->lt_sleep_duration);

	self->lt_is_block_wakeable = 0;	/* Clear the flag to avoid leak */
	self->lt_sleep_duration = 0;
}

/*
 * Write time spent in queue to the aggregation.
 * lt_sch_delay is the interval between the time when a thread becomes
 * runnable and the time when it is actually on the CPU.
 */
sched::resume:on-cpu
/self->lt_sch_delay != 0/
{
	@lt_named_count[pid, tid, "Wait for available CPU"] = count();
	@lt_named_sum[pid, tid, "Wait for available CPU"] =
	    sum(self->lt_sch_delay);
	@lt_named_max[pid, tid, "Wait for available CPU"] =
	    max(self->lt_sch_delay);

	self->lt_sch_delay = 0;
}

/*
 * Probes to track latency caused by spinning on a lock.
 * arg1 is the spin time in nanoseconds.
 */
lockstat:::adaptive-spin
TRACE_FILTER
{
	@lt_named_count[pid, tid, "Adapt. lock spin"] = count();
	@lt_named_sum[pid, tid, "Adapt. lock spin"] = sum(arg1);
	@lt_named_max[pid, tid, "Adapt. lock spin"] = max(arg1);
}

lockstat:::spin-spin
TRACE_FILTER
{
	@lt_named_count[pid, tid, "Spinlock spin"] = count();
	@lt_named_sum[pid, tid, "Spinlock spin"] = sum(arg1);
	@lt_named_max[pid, tid, "Spinlock spin"] = max(arg1);
}

/*
 * Probes to track latency caused by blocking on a lock.
 * The leading '#' in the name is part of the key consumed by the
 * latencytop user program; do not change it.
 */
lockstat:::adaptive-block
TRACE_FILTER
{
	@lt_named_count[pid, tid, "#Adapt. lock block"] = count();
	@lt_named_sum[pid, tid, "#Adapt. lock block"] = sum(arg1);
	@lt_named_max[pid, tid, "#Adapt. lock block"] = max(arg1);
}

lockstat:::rw-block
TRACE_FILTER
{
	@lt_named_count[pid, tid, "#RW. lock block"] = count();
	@lt_named_sum[pid, tid, "#RW. lock block"] = sum(arg1);
	@lt_named_max[pid, tid, "#RW. lock block"] = max(arg1);
}

#if defined(ENABLE_SYNCOBJ)
/*
 * Probes to track latency caused by synchronization objects.
 */
this int stype;
this unsigned long long wchan;
this unsigned long long wtime;

sched:::wakeup
/*
 * Currently we are unable to track wakeup from sched, because all its LWP IDs
 * are zero when we trace it and that makes lt_timestamps unusable.
 */
/args[1]->pr_pid != 0 &&
    lt_timestamps[args[1]->pr_pid, args[0]->pr_lwpid] != 0/
{
	this->stype = args[0]->pr_stype;
	this->wchan = args[0]->pr_wchan;
	/*
	 * We can use lt_timestamps[] here, because
	 * wakeup is always fired before enqueue.
	 * After enqueue, lt_timestamps[] will be overwritten.
	 */
	this->wtime = timestamp - lt_timestamps[args[1]->pr_pid,
	    args[0]->pr_lwpid];

	@lt_sync_count[args[1]->pr_pid, args[0]->pr_lwpid, this->stype,
	    this->wchan] = count();
	@lt_sync_sum[args[1]->pr_pid, args[0]->pr_lwpid, this->stype,
	    this->wchan] = sum(this->wtime);
	@lt_sync_max[args[1]->pr_pid, args[0]->pr_lwpid, this->stype,
	    this->wchan] = max(this->wtime);
}
#endif /* defined(ENABLE_SYNCOBJ) */

#else /* !defined(ENABLE_LOW_OVERHEAD) */

/*
 * This is the low overhead mode.
 * In order to reduce the number of instructions executed during each
 * off-cpu and on-cpu event, we do the following:
 *
 *	1. Use sampling and update aggregations only roughly 1/100 times
 *	    (SAMPLE_TIMES).
 *	2. Do not track anything other than what is needed for "main" window.
 *	3. Use as few thread local variables as possible.
 */

#define SAMPLE_TIMES 100
#define SAMPLE_THRESHOLD 50000000

/*
 * Log timestamp when a thread is off CPU.
 */
sched::resume:off-cpu
TRACE_FILTER_COND(curlwpsinfo->pr_state == SSLEEP)
{
	self->lt_timestamp = timestamp;
#if defined(ENABLE_FILTER)
	self->lt_is_block_wakeable = curthread->t_flag & T_WAKEABLE;
#endif /* defined(ENABLE_FILTER) */
}

/*
 * Calculate latency when a thread is actually on the CPU.
 * Small latencies are only sampled: until lt_counter reaches
 * SAMPLE_TIMES - 1, lt_timestamp is zeroed so the clauses below skip them.
 */
this int need_skip;
sched::resume:on-cpu
/self->lt_timestamp != 0/
{
	self->lt_timestamp = timestamp - self->lt_timestamp;

#if defined(ENABLE_FILTER)
	self->lt_timestamp =
	    (self->lt_timestamp > FILTER_THRESHOLD &&
	    self->lt_is_block_wakeable != 0) ? 0 : self->lt_timestamp;
	self->lt_is_block_wakeable = 0;
#endif /* defined(ENABLE_FILTER) */

	this->need_skip = (self->lt_counter < (SAMPLE_TIMES - 1) &&
	    self->lt_timestamp <= SAMPLE_THRESHOLD) ? 1 : 0;
	self->lt_timestamp = this->need_skip ? 0 : self->lt_timestamp;
	self->lt_counter += this->need_skip;
}

/*
 * Track large latency first.
 * sum(1) is used instead of count() so this clause can share
 * @lt_call_count with the sampled clause below, which adds
 * lt_counter + 1 per record.
 */
sched::resume:on-cpu
/self->lt_timestamp > SAMPLE_THRESHOLD/
{
	this->cause = self->lt_stackp > 0 ?
	    self->lt_cause[self->lt_stackp - 1] : "";
	this->priority = self->lt_stackp > 0 ?
	    self->lt_prio[self->lt_stackp - 1] : 0;

	@lt_call_count[pid, tid, stack(), this->cause,
	    this->priority] = sum(1);
	@lt_call_sum[pid, tid, stack(), this->cause,
	    this->priority] = sum(self->lt_timestamp);
	@lt_call_max[pid, tid, stack(), this->cause,
	    this->priority] = max(self->lt_timestamp);

	self->lt_timestamp = 0;
}

/*
 * If we fall back to this probe, that means the latency is small and counter
 * has reached SAMPLE_TIMES.
 */
sched::resume:on-cpu
/self->lt_timestamp != 0/
{
	this->cause = self->lt_stackp > 0 ?
	    self->lt_cause[self->lt_stackp - 1] : "";
	this->priority = self->lt_stackp > 0 ?
	    self->lt_prio[self->lt_stackp - 1] : 0;

	/* Need +1 because lt_counter has not been updated in this cycle. */
	@lt_call_count[pid, tid, stack(), this->cause,
	    this->priority] = sum(self->lt_counter + 1);
	@lt_call_sum[pid, tid, stack(), this->cause,
	    this->priority] = sum((self->lt_counter + 1) * self->lt_timestamp);
	@lt_call_max[pid, tid, stack(), this->cause,
	    this->priority] = max(self->lt_timestamp);

	self->lt_timestamp = 0;
	self->lt_counter = 0;
}

#endif /* !defined(ENABLE_LOW_OVERHEAD) */

/*
 * Pair of probes that pushes/pops a (cause, priority) tag on the per-thread
 * tag stack. A tag is pushed only when the stack is empty or the new
 * priority is at least the current top's priority; it is popped only when
 * the returning probe matches the cause on top of the stack.
 */
#define TRANSLATE(entryprobe, returnprobe, cause, priority) \
entryprobe \
TRACE_FILTER_COND(self->lt_stackp == 0 || \
    (self->lt_stackp < MAX_TAG && \
    self->lt_prio[self->lt_stackp - 1] <= priority) ) \
{ \
	self->lt_prio[self->lt_stackp] = priority; \
	self->lt_cause[self->lt_stackp] = cause; \
	++self->lt_stackp; \
} \
returnprobe \
TRACE_FILTER_COND(self->lt_stackp > 0 && \
    self->lt_cause[self->lt_stackp - 1] == cause) \
{ \
	--self->lt_stackp; \
	self->lt_cause[self->lt_stackp] = NULL; \
}

/*
 * Syscalls have a priority of 10. This is to make sure that latency is
 * traced to one of the syscalls only if nothing else matches.
 * We put this special probe here because it uses "probefunc" variable instead
 * of a constant string.
 */

TRANSLATE(syscall:::entry, syscall:::return, probefunc, 10)