1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2008-2009, Intel Corporation.
23 * All Rights Reserved.
24 */
25
/* Depth limit of the per-thread cause-tag stack (lt_prio[]/lt_cause[]). */
#define	MAX_TAG		8
/* Maximum number of frames recorded by stack() (see stackframes below). */
#define	MAX_STACK	64

/* Generous buffer sizing so a busy system does not drop records. */
#pragma D option aggsize=8m
#pragma D option bufsize=16m
#pragma D option dynvarsize=16m
#pragma D option aggrate=0
#pragma D option stackframes=MAX_STACK
/*
 * Our D script needs to compile even if some of the TRANSLATE probes cannot
 * be found. Missing probes can be caused by older kernel, different
 * architecture, unloaded modules etc.
 */
#pragma D option zdefs
40
/*
 * Build the predicate attached to every traced probe clause.
 *
 *   TRACE_FILTER         - bare predicate (or nothing) appended to a probe.
 *   TRACE_FILTER_COND(a) - the same filter AND'ed with an extra condition.
 *
 * With ENABLE_SCHED the scheduler (pid 0) is always traced as well;
 * otherwise it is excluded. TRACE_PID / TRACE_PGID narrow tracing to a
 * single process or process group.
 */
#if defined(ENABLE_SCHED)
#if defined(TRACE_PID)
#define TRACE_FILTER    / pid == 0 || pid == TRACE_PID /
#define TRACE_FILTER_COND(a)    / (pid == 0 || pid == TRACE_PID) && (a) /
#elif defined(TRACE_PGID)
#define TRACE_FILTER    / pid == 0 || curpsinfo->pr_pgid == TRACE_PGID /
/*
 * The definition below must continue onto the next line with a backslash;
 * without it the macro expanded to nothing and the predicate was left as a
 * stray, syntactically invalid line.
 */
#define TRACE_FILTER_COND(a)	\
    / (pid == 0 || curpsinfo->pr_pgid == TRACE_PGID) && (a) /
#else
#define TRACE_FILTER
#define TRACE_FILTER_COND(a)	/ (a) /
#endif
#else	/* ENABLE_SCHED */
#if defined(TRACE_PID)
#define TRACE_FILTER    / pid == TRACE_PID /
#define TRACE_FILTER_COND(a)    / (pid == TRACE_PID) && (a) /
#elif defined(TRACE_PGID)
#define TRACE_FILTER    / curpsinfo->pr_pgid == TRACE_PGID /
#define TRACE_FILTER_COND(a)    / (curpsinfo->pr_pgid == TRACE_PGID) && (a) /
#else
#define TRACE_FILTER	/ pid != 0 /
#define TRACE_FILTER_COND(a)    / (pid != 0) && (a) /
#endif
#endif /* ENABLE_SCHED */
65
/* Threshold to filter WAKEABLE latencies (ns; timestamp is nanoseconds). */
#define FILTER_THRESHOLD	5000000
/* From thread.h */
#define T_WAKEABLE		2

/*
 * This array is used to store timestamp of when threads are enqueued
 * to dispatch queue.
 * self-> is not accessible when enqueue happens.
 */
unsigned long long lt_timestamps[int, int];	/* keyed by [pid, lwpid] */

self unsigned int lt_is_block_wakeable;	/* T_WAKEABLE bit captured at off-cpu */
self unsigned long long lt_sleep_start;	/* timestamp taken at off-cpu */
self unsigned long long lt_sleep_duration;	/* off-cpu -> enqueue interval */
self unsigned long long lt_sch_delay;	/* enqueue -> on-cpu interval */
self unsigned int lt_counter;		/* only used in low overhead */
self unsigned long long lt_timestamp;	/* only used in low overhead */
self unsigned int lt_stackp;		/* depth of the TRANSLATE tag stack */
self unsigned int lt_prio[int];		/* priority per tag-stack slot */
self string lt_cause[int];		/* cause label per tag-stack slot */

/* Clause-local scratch: attribution read from the top of the tag stack. */
this unsigned int priority;
this string cause;
90
/*
 * Clean up everything when an LWP exits, otherwise we will run out of
 * memory (global and thread-local variables would accumulate forever).
 */
proc:::lwp-exit
{
	/* Drop the global enqueue-timestamp slot for this [pid, lwpid]. */
	lt_timestamps[curpsinfo->pr_pid, curlwpsinfo->pr_lwpid] = 0;

	self->lt_sleep_start = 0;
	self->lt_is_block_wakeable = 0;
	self->lt_counter = 0;
	self->lt_timestamp = 0;

	/*
	 * Workaround: no way to clear associative array.
	 * We have to manually clear 0 ~ (MAX_TAG-1).
	 */

	self->lt_prio[0] = 0;
	self->lt_prio[1] = 0;
	self->lt_prio[2] = 0;
	self->lt_prio[3] = 0;
	self->lt_prio[4] = 0;
	self->lt_prio[5] = 0;
	self->lt_prio[6] = 0;
	self->lt_prio[7] = 0;

	self->lt_cause[0] = 0;
	self->lt_cause[1] = 0;
	self->lt_cause[2] = 0;
	self->lt_cause[3] = 0;
	self->lt_cause[4] = 0;
	self->lt_cause[5] = 0;
	self->lt_cause[6] = 0;
	self->lt_cause[7] = 0;
}
126
#if !defined(ENABLE_LOW_OVERHEAD)
/*
 * Log timestamp when a thread is taken off the CPU.
 * Fires only when the thread is going to sleep (SSLEEP). Records whether
 * the block is interruptible (T_WAKEABLE) and stamps both the thread-local
 * variable and the global [pid, lwpid] slot so sched:::enqueue (where
 * self-> is unavailable) can update the latter.
 */
sched::resume:off-cpu
TRACE_FILTER_COND(curlwpsinfo->pr_state == SSLEEP)
{
	self->lt_sleep_start = timestamp;
	self->lt_is_block_wakeable = curthread->t_flag & T_WAKEABLE;

	lt_timestamps[curpsinfo->pr_pid, curlwpsinfo->pr_lwpid] =
	    self->lt_sleep_start;
}
140
/*
 * Log timestamp when a thread is put on a dispatch queue and becomes runnable.
 * Overwrites the off-cpu stamp placed in lt_timestamps[] by the off-cpu
 * clause; only threads we saw going to sleep (slot != 0) are updated.
 */
sched:::enqueue
/lt_timestamps[args[1]->pr_pid, args[0]->pr_lwpid] != 0/
{
	lt_timestamps[args[1]->pr_pid, args[0]->pr_lwpid] = timestamp;
}
149
/*
 * Calculate latency when the thread is actually on the CPU.
 * This is necessary in order to get the right stack.
 */
this unsigned long long end;	/* enqueue time, or "now" if unknown */
this unsigned long long now;	/* on-cpu time */
sched::resume:on-cpu
/self->lt_sleep_start != 0/
{
	this->end = lt_timestamps[curpsinfo->pr_pid, curlwpsinfo->pr_lwpid];
	this->now = timestamp;
	lt_timestamps[curpsinfo->pr_pid, curlwpsinfo->pr_lwpid] = 0;
	/*
	 * If enqueue never overwrote the slot (it still holds the off-cpu
	 * stamp, or is 0), fall back to "now": the whole interval is then
	 * charged to sleep and the dispatch-queue delay computes to zero.
	 */
	this->end = (this->end != 0 && this->end != self->lt_sleep_start)
	    ? this->end : this->now;

	self->lt_sch_delay = this->now - this->end;
	self->lt_sleep_duration = this->end - self->lt_sleep_start;
	self->lt_sleep_start = 0;
}
169
/*
 * Filter: drop all "large" latency when it is interruptible, i.e., sleep()
 * etc. Zeroing the values here prevents the aggregation clauses below
 * (which fire on the same probe) from recording this event.
 */
#if defined(ENABLE_FILTER)
sched::resume:on-cpu
/self->lt_sleep_duration > FILTER_THRESHOLD &&
  self->lt_is_block_wakeable != 0/
{
	self->lt_sch_delay = 0;
	self->lt_sleep_duration = 0;
	self->lt_is_block_wakeable = 0;
}
#endif /* defined(ENABLE_FILTER) */
184
/*
 * Write sleep time to the aggregation.
 * lt_sleep_duration is the duration between the time when a thread is taken
 * off the CPU and the time when it is enqueued again.
 */
sched::resume:on-cpu
/self->lt_sleep_duration != 0/
{
	/* Attribute the latency to the innermost active TRANSLATE tag. */
	this->cause = self->lt_stackp > 0 ?
	    self->lt_cause[self->lt_stackp - 1] : "";
	this->priority = self->lt_stackp > 0 ?
	    self->lt_prio[self->lt_stackp - 1] : 0;

	@lt_call_count[pid, tid, stack(), this->cause,
	    this->priority] = count();
	@lt_call_sum[pid, tid, stack(), this->cause,
	    this->priority] = sum(self->lt_sleep_duration);
	@lt_call_max[pid, tid, stack(), this->cause,
	    this->priority] = max(self->lt_sleep_duration);

	self->lt_is_block_wakeable = 0;	/* Clear the flag to avoid leak */
	self->lt_sleep_duration = 0;
}
208
/*
 * Write time spent in queue to the aggregation.
 * lt_sch_delay is the interval between the time when a thread becomes
 * runnable and the time when it is actually on the CPU.
 */
sched::resume:on-cpu
/self->lt_sch_delay != 0/
{
	@lt_named_count[pid, tid, "Wait for available CPU"] = count();
	@lt_named_sum[pid, tid, "Wait for available CPU"] =
	    sum(self->lt_sch_delay);
	@lt_named_max[pid, tid, "Wait for available CPU"] =
	    max(self->lt_sch_delay);

	self->lt_sch_delay = 0;
}
225
/*
 * Probes to track latency caused by spinning on a lock.
 * arg1 of the lockstat spin probes is the spin time (nanoseconds, per the
 * lockstat provider documentation).
 */
lockstat:::adaptive-spin
TRACE_FILTER
{
	@lt_named_count[pid, tid, "Adapt. lock spin"] = count();
	@lt_named_sum[pid, tid, "Adapt. lock spin"] = sum(arg1);
	@lt_named_max[pid, tid, "Adapt. lock spin"] = max(arg1);
}

lockstat:::spin-spin
TRACE_FILTER
{
	@lt_named_count[pid, tid, "Spinlock spin"] = count();
	@lt_named_sum[pid, tid, "Spinlock spin"] = sum(arg1);
	@lt_named_max[pid, tid, "Spinlock spin"] = max(arg1);
}
244
/*
 * Probes to track latency caused by blocking on a lock.
 * NOTE(review): the leading '#' in these labels appears intentional —
 * presumably a marker interpreted by the consumer; verify against the
 * reader before changing.
 */
lockstat:::adaptive-block
TRACE_FILTER
{
	@lt_named_count[pid, tid, "#Adapt. lock block"] = count();
	@lt_named_sum[pid, tid, "#Adapt. lock block"] = sum(arg1);
	@lt_named_max[pid, tid, "#Adapt. lock block"] = max(arg1);
}

lockstat:::rw-block
TRACE_FILTER
{
	@lt_named_count[pid, tid, "#RW. lock block"] = count();
	@lt_named_sum[pid, tid, "#RW. lock block"] = sum(arg1);
	@lt_named_max[pid, tid, "#RW. lock block"] = max(arg1);
}
263
#if defined(ENABLE_SYNCOBJ)
/*
 * Probes to track latency caused by synchronization objects.
 */
this int stype;				/* sync object type (pr_stype) */
this unsigned long long wchan;		/* wait channel address (pr_wchan) */
this unsigned long long wtime;		/* time spent blocked */

sched:::wakeup
/*
 * Currently we are unable to track wakeup from sched, because all its LWP IDs
 * are zero when we trace it and that makes lt_timestamps unusable.
 */
/args[1]->pr_pid != 0 &&
    lt_timestamps[args[1]->pr_pid, args[0]->pr_lwpid] != 0/
{
	this->stype = args[0]->pr_stype;
	this->wchan = args[0]->pr_wchan;
	/*
	 * We can use lt_timestamps[] here, because
	 * wakeup is always fired before enqueue.
	 * After enqueue, lt_timestamps[] will be overwritten.
	 */
	this->wtime = timestamp - lt_timestamps[args[1]->pr_pid,
	    args[0]->pr_lwpid];

	@lt_sync_count[args[1]->pr_pid, args[0]->pr_lwpid, this->stype,
	    this->wchan] = count();
	@lt_sync_sum[args[1]->pr_pid, args[0]->pr_lwpid, this->stype,
	    this->wchan] = sum(this->wtime);
	@lt_sync_max[args[1]->pr_pid, args[0]->pr_lwpid, this->stype,
	    this->wchan] = max(this->wtime);
}
#endif /* defined(ENABLE_SYNCOBJ) */
298
#else /* !defined(ENABLE_LOW_OVERHEAD) */

/*
 * This is the low overhead mode.
 * In order to reduce the number of instructions executed during each
 * off-cpu and on-cpu event, we do the following:
 *
 *	1. Use sampling and update aggregations only roughly 1/100 times
 *		(SAMPLE_TIMES).
 *	2. Do not track anything other than what is needed for "main" window.
 *	3. Use as few thread local variables as possible.
 */

/* Record roughly 1 in SAMPLE_TIMES small-latency events. */
#define SAMPLE_TIMES		100
/* Latencies above this (ns) bypass sampling and are always recorded. */
#define SAMPLE_THRESHOLD	50000000
314
/*
 * Log timestamp when a thread is off CPU.
 * lt_timestamp holds a point in time here; the on-cpu clause converts it
 * into a duration.
 */
sched::resume:off-cpu
TRACE_FILTER_COND(curlwpsinfo->pr_state == SSLEEP)
{
	self->lt_timestamp = timestamp;
#if defined(ENABLE_FILTER)
	self->lt_is_block_wakeable = curthread->t_flag & T_WAKEABLE;
#endif /* defined(ENABLE_FILTER) */
}
326
/*
 * Calculate latency when a thread is actually on the CPU.
 * After this clause, lt_timestamp holds the off-cpu duration (not a point
 * in time), or 0 if the event was filtered or sampled out.
 */
this int need_skip;
sched::resume:on-cpu
/self->lt_timestamp != 0/
{
	self->lt_timestamp = timestamp - self->lt_timestamp;

#if defined(ENABLE_FILTER)
	/* Drop large interruptible (WAKEABLE) latencies, e.g. sleep(). */
	self->lt_timestamp =
	    (self->lt_timestamp > FILTER_THRESHOLD &&
	    self->lt_is_block_wakeable != 0) ? 0 : self->lt_timestamp;
	self->lt_is_block_wakeable = 0;
#endif /* defined(ENABLE_FILTER) */

	/*
	 * Skip (but count) the event unless the sample counter is due or
	 * the latency exceeds SAMPLE_THRESHOLD.
	 */
	this->need_skip = (self->lt_counter < (SAMPLE_TIMES - 1) &&
	    self->lt_timestamp <= SAMPLE_THRESHOLD) ? 1 : 0;
	self->lt_timestamp = this->need_skip ? 0 : self->lt_timestamp;
	self->lt_counter += this->need_skip;
}
348
/*
 * Track large latency first.
 * Note: sum(1) is used instead of count() so this aggregation is of the
 * same kind as the sampled clause below, which adds lt_counter + 1.
 */
sched::resume:on-cpu
/self->lt_timestamp > SAMPLE_THRESHOLD/
{
	/* Attribute the latency to the innermost active TRANSLATE tag. */
	this->cause = self->lt_stackp > 0 ?
	    self->lt_cause[self->lt_stackp - 1] : "";
	this->priority = self->lt_stackp > 0 ?
	    self->lt_prio[self->lt_stackp - 1] : 0;

	@lt_call_count[pid, tid, stack(), this->cause,
	    this->priority] = sum(1);
	@lt_call_sum[pid, tid, stack(), this->cause,
	    this->priority] = sum(self->lt_timestamp);
	@lt_call_max[pid, tid, stack(), this->cause,
	    this->priority] = max(self->lt_timestamp);

	self->lt_timestamp = 0;
}
369
/*
 * If we fall back to this probe, that means the latency is small and counter
 * has reached SAMPLE_TIMES.
 */
sched::resume:on-cpu
/self->lt_timestamp != 0/
{
	/* Attribute the latency to the innermost active TRANSLATE tag. */
	this->cause = self->lt_stackp > 0 ?
	    self->lt_cause[self->lt_stackp - 1] : "";
	this->priority = self->lt_stackp > 0 ?
	    self->lt_prio[self->lt_stackp - 1] : 0;

	/* Need +1 because lt_counter has not been updated in this cycle. */
	@lt_call_count[pid, tid, stack(), this->cause,
	    this->priority] = sum(self->lt_counter + 1);
	/* Extrapolate: charge the sampled latency to the skipped events too. */
	@lt_call_sum[pid, tid, stack(), this->cause,
	    this->priority] = sum((self->lt_counter + 1) * self->lt_timestamp);
	@lt_call_max[pid, tid, stack(), this->cause,
	    this->priority] = max(self->lt_timestamp);

	self->lt_timestamp = 0;
	self->lt_counter = 0;
}

#endif /* !defined(ENABLE_LOW_OVERHEAD) */
395
/*
 * TRANSLATE(entryprobe, returnprobe, cause, priority)
 *
 * Pairs an entry/return probe so that latencies between them are attributed
 * to "cause". A per-thread stack (lt_cause[]/lt_prio[], at most MAX_TAG
 * deep) tracks nested causes: a new cause is pushed only when the stack is
 * empty or its priority is >= the priority currently on top; the return
 * probe pops only when the top-of-stack cause matches its own.
 */
#define	TRANSLATE(entryprobe, returnprobe, cause, priority)		\
entryprobe								\
TRACE_FILTER_COND(self->lt_stackp == 0 ||				\
    (self->lt_stackp < MAX_TAG &&					\
    self->lt_prio[self->lt_stackp - 1] <= priority) )			\
{									\
	self->lt_prio[self->lt_stackp] = priority;			\
	self->lt_cause[self->lt_stackp] = cause;			\
	++self->lt_stackp;						\
}									\
returnprobe								\
TRACE_FILTER_COND(self->lt_stackp > 0 &&				\
    self->lt_cause[self->lt_stackp - 1] == cause)			\
{									\
	--self->lt_stackp;						\
	self->lt_cause[self->lt_stackp] = NULL;				\
}
413
/*
 * Syscalls have a priority of 10. This is to make sure that latency is
 * traced to one of the syscalls only if nothing else matches.
 * We put this special probe here because it uses "probefunc" variable instead
 * of a constant string (probefunc evaluates to the name of the syscall
 * whose probe fired).
 */

TRANSLATE(syscall:::entry, syscall:::return, probefunc, 10)
422