xref: /titanic_51/usr/src/cmd/latencytop/common/latencytop.d (revision 3b17bf656ccff642e3bb47eab92a32f02dff5e97)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2008-2009, Intel Corporation.
23  * All Rights Reserved.
24  */
25 
/*
 * MAX_TAG bounds the depth of the per-thread "cause" stack maintained by
 * the TRANSLATE() clauses at the bottom of this script; MAX_STACK is the
 * number of kernel stack frames captured by stack().
 */
26 #define	MAX_TAG		8
27 #define	MAX_STACK	64
28 
/* Larger-than-default buffers: this script keeps many aggregation tuples. */
29 #pragma D option aggsize=8m
30 #pragma D option bufsize=16m
31 #pragma D option dynvarsize=16m
32 #pragma D option aggrate=0
33 #pragma D option stackframes=MAX_STACK
34 /*
35  * Our D script needs to compile even if some of the TRANSLATE probes cannot
36  * be found. Missing probes can be caused by older kernel, different
37  * architecture, unloaded modules etc.
38  */
39 #pragma D option zdefs
40 
/*
 * With ENABLE_SCHED, every thread is traced; otherwise each clause is
 * predicated with "pid != 0" so the kernel's sched process is excluded.
 */
41 #if defined(ENABLE_SCHED)
42 #define TRACE_FILTER
43 #define TRACE_FILTER_COND(a)	/ (a) /
44 #else
45 #define TRACE_FILTER	/ pid != 0 /
46 #define TRACE_FILTER_COND(a)	/ pid != 0 && (a) /
47 #endif
48 
49 /* Threshold to filter WAKEABLE latencies. */
/* 5000000 ns == 5 ms. */
50 #define FILTER_THRESHOLD	5000000
51 /* From thread.h */
52 #define T_WAKEABLE		2
53 
54 /*
55  * This array is used to store timestamp of when threads are enqueued
56  * to dispatch queue.
57  * self-> is not accessible when enqueue happens.
58  */
/* Keyed by [pid, lwpid]; 0 means "no timestamp recorded". */
59 unsigned long long lt_timestamps[int, int];
60 
/* Snapshot of (curthread->t_flag & T_WAKEABLE) taken at off-cpu time. */
61 self unsigned int lt_is_block_wakeable;
/* Timestamp of when this thread last went off CPU into SSLEEP. */
62 self unsigned long long lt_sleep_start;
/* Off-cpu-to-enqueue interval, computed at on-cpu time. */
63 self unsigned long long lt_sleep_duration;
/* Enqueue-to-on-cpu interval (time waiting on the dispatch queue). */
64 self unsigned long long lt_sch_delay;
65 self unsigned int lt_counter;		/* only used in low overhead */
66 self unsigned long long lt_timestamp;	/* only used in low overhead */
/* Current depth of the per-thread tag stack (0 .. MAX_TAG). */
67 self unsigned int lt_stackp;
/* Parallel stacks of priority/cause tags pushed by TRANSLATE(). */
68 self unsigned int lt_prio[int];
69 self string lt_cause[int];
70 
/* Clause-local scratch used when writing the @lt_call_* aggregations. */
71 this unsigned int priority;
72 this string cause;
73 
74 /*
75  * Clean up everything, otherwise we will run out of memory.
76  */
77 proc:::lwp-exit
78 {
79 	lt_timestamps[curpsinfo->pr_pid, curlwpsinfo->pr_lwpid] = 0;
80 
81 	self->lt_sleep_start = 0;
82 	self->lt_is_block_wakeable = 0;
83 	self->lt_counter = 0;
84 	self->lt_timestamp = 0;
85 
86 	/*
87 	 * Workaround: no way to clear associative array.
88 	 * We have to manually clear 0 ~ (MAX_TAG-1).
89 	 */
90 
	/* Unrolled by hand; keep in sync with MAX_TAG (currently 8). */
91 	self->lt_prio[0] = 0;
92 	self->lt_prio[1] = 0;
93 	self->lt_prio[2] = 0;
94 	self->lt_prio[3] = 0;
95 	self->lt_prio[4] = 0;
96 	self->lt_prio[5] = 0;
97 	self->lt_prio[6] = 0;
98 	self->lt_prio[7] = 0;
99 
100 	self->lt_cause[0] = 0;
101 	self->lt_cause[1] = 0;
102 	self->lt_cause[2] = 0;
103 	self->lt_cause[3] = 0;
104 	self->lt_cause[4] = 0;
105 	self->lt_cause[5] = 0;
106 	self->lt_cause[6] = 0;
107 	self->lt_cause[7] = 0;
108 }
109 
110 #if !defined(ENABLE_LOW_OVERHEAD)
111 /*
112  * Log timestamp when a thread is taken off the CPU.
113  */
/*
 * The SSLEEP check restricts tracking to threads that actually go to
 * sleep, not threads that are merely preempted while runnable.
 */
114 sched::resume:off-cpu
115 TRACE_FILTER_COND(curlwpsinfo->pr_state == SSLEEP)
116 {
117 	self->lt_sleep_start = timestamp;
118 	self->lt_is_block_wakeable = curthread->t_flag & T_WAKEABLE;
119 
120 	lt_timestamps[curpsinfo->pr_pid, curlwpsinfo->pr_lwpid] =
121 	    self->lt_sleep_start;
122 }
123 
124 /*
125  * Log timestamp when a thread is put on a dispatch queue and becomes runnable.
126  */
/* sched:::enqueue: args[0] is the enqueued lwp, args[1] its process. */
127 sched:::enqueue
128 /lt_timestamps[args[1]->pr_pid, args[0]->pr_lwpid] != 0/
129 {
130 	lt_timestamps[args[1]->pr_pid, args[0]->pr_lwpid] = timestamp;
131 }
132 
133 /*
134  * Calculate latency when the thread is actually on the CPU.
135  * This is necessary in order to get the right stack.
136  */
137 this unsigned long long end;
138 this unsigned long long now;
139 sched::resume:on-cpu
140 /self->lt_sleep_start != 0/
141 {
142 	this->end = lt_timestamps[curpsinfo->pr_pid, curlwpsinfo->pr_lwpid];
143 	this->now = timestamp;
144 	lt_timestamps[curpsinfo->pr_pid, curlwpsinfo->pr_lwpid] = 0;
	/*
	 * If the slot still holds lt_sleep_start, sched:::enqueue never
	 * updated it for this sleep; fall back to end = now, i.e. count
	 * the whole interval as sleep time and none as dispatch delay.
	 */
145 	this->end = (this->end != 0 && this->end != self->lt_sleep_start)
146 	    ? this->end : this->now;
147 
148 	self->lt_sch_delay = this->now - this->end;
149 	self->lt_sleep_duration = this->end - self->lt_sleep_start;
150 	self->lt_sleep_start = 0;
151 }
152 
153 /*
154  * Filter: drop all "large" latency when it is interruptible, i.e., sleep()
155  * etc.
156  */
157 #if defined(ENABLE_FILTER)
/*
 * Clauses on the same probe fire in program order, so this zeroing runs
 * after the latency calculation above and before the aggregating clauses
 * below, effectively discarding the event.
 */
158 sched::resume:on-cpu
159 /self->lt_sleep_duration > FILTER_THRESHOLD &&
160   self->lt_is_block_wakeable != 0/
161 {
162 	self->lt_sch_delay = 0;
163 	self->lt_sleep_duration = 0;
164 	self->lt_is_block_wakeable = 0;
165 }
166 #endif /* defined(ENABLE_FILTER) */
167 
168 /*
169  * Write sleep time to the aggregation.
170  * lt_sleep_duration is the duration between the time when a thread is taken
171  * off the CPU and the time when it is enqueued again.
172  */
173 sched::resume:on-cpu
174 /self->lt_sleep_duration != 0/
175 {
	/* Attribute the latency to the innermost TRANSLATE() tag, if any. */
176 	this->cause = self->lt_stackp > 0 ?
177 	    self->lt_cause[self->lt_stackp - 1] : "";
178 	this->priority = self->lt_stackp > 0 ?
179 	    self->lt_prio[self->lt_stackp - 1] : 0;
180 
181 	@lt_call_count[pid, tid, stack(), this->cause,
182 	    this->priority] = count();
183 	@lt_call_sum[pid, tid, stack(), this->cause,
184 	    this->priority] = sum(self->lt_sleep_duration);
185 	@lt_call_max[pid, tid, stack(),  this->cause,
186 	    this->priority] = max(self->lt_sleep_duration);
187 
188 	self->lt_is_block_wakeable = 0;	/* Clear the flag to avoid leak */
189 	self->lt_sleep_duration = 0;
190 }
191 
192 /*
193  * Write time spent in queue to the aggregation.
194  * lt_sch_delay is the interval between the time when a thread becomes
195  * runnable and the time when it is actually on the CPU.
196  */
197 sched::resume:on-cpu
198 /self->lt_sch_delay != 0/
199 {
	/* Fixed cause string; shown as a named row in the latencytop UI. */
200 	@lt_named_count[pid, tid, "Wait for available CPU"] = count();
201 	@lt_named_sum[pid, tid, "Wait for available CPU"] =
202 	    sum(self->lt_sch_delay);
203 	@lt_named_max[pid, tid, "Wait for available CPU"] =
204 	    max(self->lt_sch_delay);
205 
206 	self->lt_sch_delay = 0;
207 }
208 
209 /*
210  * Probes to track latency caused by spinning on a lock.
211  */
/*
 * NOTE(review): arg1 is consumed as a duration here — per the lockstat
 * provider the spin-time argument is in nanoseconds; confirm against the
 * provider documentation for the target kernel.
 */
212 lockstat:::adaptive-spin
213 TRACE_FILTER
214 {
215 	@lt_named_count[pid, tid, "Adapt. lock spin"] = count();
216 	@lt_named_sum[pid, tid, "Adapt. lock spin"] = sum(arg1);
217 	@lt_named_max[pid, tid, "Adapt. lock spin"] = max(arg1);
218 }
219 
220 lockstat:::spin-spin
221 TRACE_FILTER
222 {
223 	@lt_named_count[pid, tid, "Spinlock spin"] = count();
224 	@lt_named_sum[pid, tid, "Spinlock spin"] = sum(arg1);
225 	@lt_named_max[pid, tid, "Spinlock spin"] = max(arg1);
226 }
227 
228 /*
229  * Probes to track latency caused by blocking on a lock.
230  */
/*
 * NOTE(review): the leading '#' in these cause strings appears to be
 * significant to the latencytop userland consumer — confirm before
 * changing the names.
 */
231 lockstat:::adaptive-block
232 TRACE_FILTER
233 {
234 	@lt_named_count[pid, tid, "#Adapt. lock block"] = count();
235 	@lt_named_sum[pid, tid, "#Adapt. lock block"] = sum(arg1);
236 	@lt_named_max[pid, tid, "#Adapt. lock block"] = max(arg1);
237 }
238 
239 lockstat:::rw-block
240 TRACE_FILTER
241 {
242 	@lt_named_count[pid, tid, "#RW. lock block"] = count();
243 	@lt_named_sum[pid, tid, "#RW. lock block"] = sum(arg1);
244 	@lt_named_max[pid, tid, "#RW. lock block"] = max(arg1);
245 }
246 
247 #if defined(ENABLE_SYNCOBJ)
248 /*
249  * Probes to track latency caused by synchronization objects.
250  */
/* pr_stype: synchronization object type; pr_wchan: wait channel address. */
251 this int stype;
252 this unsigned long long wchan;
253 this unsigned long long wtime;
254 
255 sched:::wakeup
256 /*
257  * Currently we are unable to track wakeup from sched, because all its LWP IDs
258  * are zero when we trace it and that makes lt_timestamps unusable.
259  */
260 /args[1]->pr_pid != 0 &&
261     lt_timestamps[args[1]->pr_pid, args[0]->pr_lwpid] != 0/
262 {
263 	this->stype = args[0]->pr_stype;
264 	this->wchan = args[0]->pr_wchan;
265 	/*
266 	 * We can use lt_timestamps[] here, because
267 	 * wakeup is always fired before enqueue.
268 	 * After enqueue, lt_timestamps[] will be overwritten.
269 	 */
270 	this->wtime = timestamp - lt_timestamps[args[1]->pr_pid,
271 	    args[0]->pr_lwpid];
272 
	/* Keyed by the woken thread (the wakeup target), not the waker. */
273 	@lt_sync_count[args[1]->pr_pid, args[0]->pr_lwpid, this->stype,
274 	    this->wchan] = count();
275 	@lt_sync_sum[args[1]->pr_pid, args[0]->pr_lwpid, this->stype,
276 	    this->wchan] = sum(this->wtime);
277 	@lt_sync_max[args[1]->pr_pid, args[0]->pr_lwpid, this->stype,
278 	    this->wchan] = max(this->wtime);
279 }
280 #endif /* defined(ENABLE_SYNCOBJ) */
281 
282 #else /* !defined(ENABLE_LOW_OVERHEAD) */
283 
284 /*
285  * This is the low overhead mode.
286  * In order to reduce the number of instructions executed during each
287  * off-cpu and on-cpu event, we do the following:
288  *
289  *	1. Use sampling and update aggregations only roughly 1/100 times
290  *		(SAMPLE_TIMES).
291  *	2. Do not track anything other than what is needed for "main" window.
292  *	3. Use as few thread local variables as possible.
293  */
294 
/* Sample 1 in SAMPLE_TIMES events; 50000000 ns == 50 ms threshold. */
295 #define SAMPLE_TIMES		100
296 #define SAMPLE_THRESHOLD	50000000
297 
298 /*
299  * Log timestamp when a thread is off CPU.
300  */
301 sched::resume:off-cpu
302 TRACE_FILTER_COND(curlwpsinfo->pr_state == SSLEEP)
303 {
304 	self->lt_timestamp = timestamp;
305 #if defined(ENABLE_FILTER)
306 	self->lt_is_block_wakeable = curthread->t_flag & T_WAKEABLE;
307 #endif /* defined(ENABLE_FILTER) */
308 }
309 
310 /*
311  * Calculate latency when a thread is actually on the CPU.
312  */
313 this int need_skip;
314 sched::resume:on-cpu
315 /self->lt_timestamp != 0/
316 {
	/* lt_timestamp now holds the off-cpu duration, not a point in time. */
317 	self->lt_timestamp = timestamp - self->lt_timestamp;
318 
319 #if defined(ENABLE_FILTER)
320 	self->lt_timestamp =
321 	    (self->lt_timestamp > FILTER_THRESHOLD &&
322 	    self->lt_is_block_wakeable != 0) ? 0 : self->lt_timestamp;
323 	self->lt_is_block_wakeable = 0;
324 #endif /* defined(ENABLE_FILTER) */
325 
	/*
	 * Sampling: skip this event (by zeroing lt_timestamp, which keeps
	 * the clauses below from firing) unless the counter has reached
	 * SAMPLE_TIMES - 1 or the latency exceeds SAMPLE_THRESHOLD.
	 * lt_counter accumulates the number of skipped events.
	 */
326 	this->need_skip = (self->lt_counter < (SAMPLE_TIMES - 1) &&
327 	    self->lt_timestamp <= SAMPLE_THRESHOLD) ? 1 : 0;
328 	self->lt_timestamp = this->need_skip ? 0 : self->lt_timestamp;
329 	self->lt_counter += this->need_skip;
330 }
331 
332 /*
333  * Track large latency first.
334  */
335 sched::resume:on-cpu
336 /self->lt_timestamp > SAMPLE_THRESHOLD/
337 {
338 	this->cause = self->lt_stackp > 0 ?
339 	    self->lt_cause[self->lt_stackp - 1] : "";
340 	this->priority = self->lt_stackp > 0 ?
341 	    self->lt_prio[self->lt_stackp - 1] : 0;
342 
	/*
	 * sum(1) rather than count(): the fallback clause below adds
	 * (lt_counter + 1) to the same aggregation, and one aggregation
	 * must use a single aggregating function throughout.
	 */
343 	@lt_call_count[pid, tid, stack(), this->cause,
344 	    this->priority] = sum(1);
345 	@lt_call_sum[pid, tid, stack(), this->cause,
346 	    this->priority] = sum(self->lt_timestamp);
347 	@lt_call_max[pid, tid, stack(), this->cause,
348 	    this->priority] = max(self->lt_timestamp);
349 
350 	self->lt_timestamp = 0;
351 }
352 
353 /*
354  * If we fall back to this probe, that means the latency is small and counter
355  * has reached SAMPLE_TIMES.
356  */
357 sched::resume:on-cpu
358 /self->lt_timestamp != 0/
359 {
360 	this->cause = self->lt_stackp > 0 ?
361 	    self->lt_cause[self->lt_stackp - 1] : "";
362 	this->priority = self->lt_stackp > 0 ?
363 	    self->lt_prio[self->lt_stackp - 1] : 0;
364 
365 	/* Need +1 because lt_counter has not been updated in this cycle. */
	/* Scale by the skipped-event count to estimate the true totals. */
366 	@lt_call_count[pid, tid, stack(), this->cause,
367 	    this->priority] = sum(self->lt_counter + 1);
368 	@lt_call_sum[pid, tid, stack(), this->cause,
369 	    this->priority] = sum((self->lt_counter + 1) * self->lt_timestamp);
370 	@lt_call_max[pid, tid, stack(), this->cause,
371 	    this->priority] = max(self->lt_timestamp);
372 
373 	self->lt_timestamp = 0;
374 	self->lt_counter = 0;
375 }
376 
377 #endif /* !defined(ENABLE_LOW_OVERHEAD) */
378 
/*
 * TRANSLATE(entry, return, cause, priority) tags a code region: the entry
 * probe pushes (priority, cause) onto the per-thread tag stack, but only
 * when the stack is empty or the innermost tag's priority does not exceed
 * the new one (and the stack is not full); the return probe pops only when
 * the innermost cause matches, so an unpaired return cannot corrupt the
 * stack.  No comments inside the macro body — it is backslash-continued.
 */
379 #define	TRANSLATE(entryprobe, returnprobe, cause, priority)		\
380 entryprobe								\
381 TRACE_FILTER_COND(self->lt_stackp == 0 ||				\
382     (self->lt_stackp < MAX_TAG &&					\
383     self->lt_prio[self->lt_stackp - 1] <= priority) )			\
384 {									\
385 	self->lt_prio[self->lt_stackp] = priority;			\
386 	self->lt_cause[self->lt_stackp] = cause;			\
387 	++self->lt_stackp;						\
388 }									\
389 returnprobe								\
390 TRACE_FILTER_COND(self->lt_stackp > 0 &&				\
391     self->lt_cause[self->lt_stackp - 1] == cause)			\
392 {									\
393 	--self->lt_stackp;						\
394 	self->lt_cause[self->lt_stackp] = NULL;				\
395 }
396 
397 /*
398  * Syscalls have a priority of 10. This is to make sure that latency is
399  * traced to one of the syscalls only if nothing else matches.
400  * We put this special probe here because it uses "probefunc" variable instead
401  * of a constant string.
402  */
403 
/* probefunc expands to the syscall name at probe-firing time. */
404 TRANSLATE(syscall:::entry, syscall:::return, probefunc, 10)
405